# 数据透视表 我们目前所用的累计操作都是按照一个维度进行,数据透视表可以看做是按照二维进行累计的操作功能。 ```python # 以泰坦尼克号数据为例子进行展示 import numpy as np import pandas as pd import seaborn as sns titanic = sns.load_dataset("titanic") print(titanic.shape) ``` (891, 15) ## 数据透视表的初步使用 ```python # 先进行粗分类 a = titanic.groupby("sex")[['survived']].mean() # 可以看出,女性获救比例大概是男性的4倍多 print(a) ``` survived sex female 0.742038 male 0.188908 ```python # 尝试按sex,class分组,然后统计逃生人数,求mean后使用层级索引 # 可以清晰的展示出,逃生人数受sex,class 的影响 a = titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack() print(a) ``` class First Second Third sex female 0.968085 0.921053 0.500000 male 0.368852 0.157407 0.135447 ## pivot_table pivot_table实现的效果等同于上一届的管道命令,是一个简写。 ```python # 尝试按sex,class分组,然后统计逃生人数,求mean后使用层级索引 a = titanic.pivot_table('survived', index='sex', columns='class') print(a) ``` class First Second Third sex female 0.968085 0.921053 0.500000 male 0.368852 0.157407 0.135447 ## 创建多级索引形状的DataFrame结果 ```python # 按照sex,age,class统计,age分三个年龄段,0-18-80 # 使用cut函数对年龄进行分段 # cut适用于明确知道分界值的时候进行分类,具体落在每个分界内的数据数量不一定 age = pd.cut(titanic['age'], [0,18,80]) a = titanic.pivot_table('survived', ['sex', age], 'class') print(a) ``` class First Second Third sex age female (0, 18] 0.909091 1.000000 0.511628 (18, 80] 0.972973 0.900000 0.423729 male (0, 18] 0.800000 0.600000 0.215686 (18, 80] 0.375000 0.071429 0.133663 ```python # 对列也可以使用类似策略 # 使用qcut对票价进行划分成两部分,每一部分人数相等 # qcut把数据分成两部分,一般是等分 fare = pd.qcut(titanic['fare'], 2) a = titanic.pivot_table('survived', ['sex', age], [fare, 'class']) print(a) ``` fare (-0.001, 14.454] (14.454, 512.329] \ class First Second Third First sex age female (0, 18] NaN 1.000000 0.714286 0.909091 (18, 80] NaN 0.880000 0.444444 0.972973 male (0, 18] NaN 0.000000 0.260870 0.800000 (18, 80] 0.0 0.098039 0.125000 0.391304 fare class Second Third sex age female (0, 18] 1.000000 0.318182 (18, 80] 0.914286 0.391304 male (0, 18] 0.818182 0.178571 (18, 80] 0.030303 0.192308