# 通过聚合运算,可以得到我们比较感兴趣的数据,以方便后续处理。
import pandas as pd
import numpy as np # 先创建一组数据表DataFrame
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randint(1, 10, 5),
})
# The key1 and key2 columns each contain repeated labels, so rows can be
# grouped by either of them.
df.groupby('key1').describe()
# The call above shows, for each key1 group, a set of summary statistics:
# count, mean, std, min, 25%, 50%, 75%, max.

# Keep the groupby object so the aggregation examples can reuse it.
grouped = df.groupby('key1')
# Note: df.groupby('key1') returns a pandas groupby object (DataFrameGroupBy),
# not a plain DataFrame.


def peak_range(s):
    """Return the spread of ``s``: its maximum minus its minimum.

    Written as a custom aggregation function; ``s`` only needs to provide
    ``max()`` and ``min()`` (for example a pandas Series).
    """
    # Prints the type of the received object, presumably to show what the
    # aggregation machinery passes in.
    print(type(s))
    return s.max() - s.min()


# Next, look at how aggregation is applied.
# Restrict the multi-function aggregation to the numeric columns: key2 holds
# strings, and pandas >= 2.0 raises for 'std'/'mean' on such a column instead
# of silently dropping it.
grouped[['data1', 'data2']].agg(['std', 'mean', 'sum', ('range', peak_range)])
# This call gathers some of the statistics that
# df.groupby('key1').describe() displays.
# In ('range', peak_range), 'range' is the column name given to the result
# computed by peak_range.

# Use a dict to aggregate only the columns of interest, each with its own
# function.
d = {'data1': 'mean',
     'data2': 'sum'}
grouped.agg(d)

# A dict value may also be a list of functions for that column.
d = {'data1': ['mean', ('range', peak_range)],
     'data2': 'sum'}
grouped.agg(d)
grouped.agg(d).reset_index()  # do not keep key1 as the index
df.groupby('key1', as_index=False).agg(d)  # same effect as the previous line
import pandas as pd
import numpy as np # 先创建一个DataFrame
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randint(1, 10, 5),
})

# Approach 1: aggregate, then merge the result back onto the original rows.
# Group by key1, take the mean of the numeric columns (key2 holds strings and
# cannot be averaged), and prefix the resulting column names with mean_.
k1_mean = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
# k1_mean is indexed by key1, so join df's key1 column against that index.
pd.merge(df, k1_mean, left_on='key1', right_index=True)

# Approach 2: transform.
# transform computes the group mean but keeps the original index positions
# and row count, so the result can be assigned straight back onto df.
k1_mean = df.groupby('key1')[['data1', 'data2']].transform('mean').add_prefix('mean_')
df[k1_mean.columns] = k1_mean
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                  columns=['a', 'b', 'c', 'd', 'e'],
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])


def demean(s):
    """Return ``s`` with its own mean subtracted from every value."""
    return s - s.mean()


# One group label per row of df, in row order.
key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)

# After demeaning within each group, every group mean is 0 or very close to 0.
demeaned.groupby(key).mean()
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 10),
    'data2': np.random.randint(1, 10, 10),
})


def top(g, n=2, column='data1'):
    """Return the first ``n`` rows of ``g`` after sorting by ``column`` descending.

    Args:
        g: Object with a ``sort_values`` method (e.g. a DataFrame or one
            group handed over by ``groupby(...).apply``).
        n: Number of rows to keep.
        column: Name of the column to sort by.
    """
    return g.sort_values(by=column, ascending=False)[:n]


# Within each key1 group, sort by data2 and keep the top three rows.
df.groupby('key1').apply(top, n=3, column='data2')
import pandas as pd
import numpy as np

# The example below shows one way to fill NaN values: with the group mean.
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
# Sample output (the values are random, so they differ from run to run):
# Ohio          0.133410
# New York      2.147483
# Vermont            NaN
# Florida      -0.608754
# Oregon        0.978375
# Nevada             NaN
# California   -1.297183
# Idaho              NaN
# dtype: float64

data.groupby(group_key).mean()
# Sample output:
# East    0.557380
# West   -0.159404
# dtype: float64

# Fill each missing value with the mean of its own group.
# group_keys=False keeps the original state labels as a flat index; without
# it, pandas >= 2.0 prepends the group label as an extra index level.
data.groupby(group_key, group_keys=False).apply(lambda g: g.fillna(g.mean()))
# Sample output:
# Ohio          0.133410
# New York      2.147483
# Vermont       0.557380
# Florida      -0.608754
# Oregon        0.978375
# Nevada       -0.159404
# California   -1.297183
# Idaho        -0.159404
# dtype: float64