分组统计 - groupby功能
- 根据某些条件将数据拆分成组
- 对每个组独立应用函数
- 将结果合并到一个数据结构中
Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。
df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
1.
# 分组
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print(df)
print('------')
print(df.groupby('A'), type(df.groupby('A')))
print('------')
# 直接分组得到一个groupby对象,是一个中间数据,没有进行计算
a = df.groupby('A').mean() # (0.494839+0.977357 -0.079595)/3
b = df.groupby(['A','B']).mean()
c = df.groupby(['A'])['D'].mean() # 以A分组,但是只计算D的平均值
print(a,type(a),'\n',a.columns)
print(b,type(b),'\n',b.columns)
print(c,type(c))
# 通过分组后的计算,得到一个新的dataframe
# 默认axis = 0,以行来分组
# 可单个或多个([])列分组
输出结果:
A B C D
0 foo one 0.539903 -0.291392
1 bar one 0.243375 1.093706
2 foo two -0.552425 -0.333666
3 bar three 0.307315 -0.094833
4 foo two -1.011648 -0.856448
5 bar two 1.078264 1.590439
6 foo one 0.550491 -0.044095
7 foo three 0.162069 0.445236
------
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7B527B8> <class 'pandas.core.groupby.DataFrameGroupBy'>
------
C D
A
bar 0.542985 0.863104
foo -0.062322 -0.216073 <class 'pandas.core.frame.DataFrame'>
Index(['C', 'D'], dtype='object')
C D
A B
bar one 0.243375 1.093706
three 0.307315 -0.094833
two 1.078264 1.590439
foo one 0.545197 -0.167744
three 0.162069 0.445236
two -0.782036 -0.595057 <class 'pandas.core.frame.DataFrame'>
Index(['C', 'D'], dtype='object')
A
bar 0.863104
foo -0.216073
Name: D, dtype: float64 <class 'pandas.core.series.Series'>
2.
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
b = df.groupby(['A','B']).mean()
b.reset_index(inplace = True)
print(b)
输出结果:
A B C D
0 bar one -1.501298 -1.450693
1 bar three -0.623903 0.832721
2 bar two 1.264561 0.265831
3 foo one 0.996789 0.645541
4 foo three 1.338508 -1.213098
5 foo two 0.934279 -1.139260
3.
# 分组 - 可迭代对象
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X'))) #输出的类型为可迭代的对象
print('-----')
print(list(df.groupby('X')), '→ 可迭代对象,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式显示\n')
for n,g in df.groupby('X'):
print(n)
print(g)
print('###')
print('-----')
# n是组名,g是分组后的Dataframe
print(df.groupby(['X']).get_group('A'),'\n')
print(df.groupby(['X']).get_group('B'),'\n')
print('-----')
# .get_group()提取分组后的组
grouped = df.groupby(['X'])
print(grouped.groups,'\n')
print(grouped.groups['A']) # 也可写:df.groupby('X').groups['A']
print('-----')
# .groups:将分组后的groups转为dict
# 可以字典索引方法来查看groups里的元素
sz = grouped.size()
print(sz,type(sz))
print('-----')
# .size():查看分组后的长度
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
grouped = df.groupby(['A','B']).groups
print(df,'\n')
print(grouped,'\n')
print(grouped[('foo', 'three')])
# 按照两个列进行分组
输出结果:
X Y
0 A 1
1 B 4
2 A 3
3 B 2
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7BA6908> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)] → 可迭代对象,直接生成list
('A', X Y
0 A 1
2 A 3) → 以元祖形式显示
A
X Y
0 A 1
2 A 3
###
B
X Y
1 B 4
3 B 2
###
-----
X Y
0 A 1
2 A 3
X Y
1 B 4
3 B 2
-----
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')}
Int64Index([0, 2], dtype='int64')
-----
X
A 2
B 2
dtype: int64 <class 'pandas.core.series.Series'>
-----
A B C D
0 foo one 0.185932 -0.128426
1 bar one 1.205172 0.860480
2 foo two 0.965735 1.008437
3 bar three 0.442906 0.065548
4 foo two 0.461985 -1.591069
5 bar two -0.917835 -0.707424
6 foo one 0.537916 -0.031545
7 foo three 0.838332 -0.804244
{('foo', 'two'): Int64Index([2, 4], dtype='int64'), ('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64')}
Int64Index([7], dtype='int64')
4.
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X'))) #输出的类型为可迭代的对象
print('-----')
print(list(df.groupby('X')), '→ 可迭代对象,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式显示\n')
tup = list(df.groupby('X'))[0]
print(tup[0],type(tup[0]))
print(tup[1],type(tup[1]))
输出结果:
X Y
0 A 1
1 B 4
2 A 3
3 B 2
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7B87A20> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)] → 可迭代对象,直接生成list
('A', X Y
0 A 1
2 A 3) → 以元祖形式显示
A <class 'str'>
X Y
0 A 1
2 A 3 <class 'pandas.core.frame.DataFrame'>
5.
# 分组计算函数方法
s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
grouped = s.groupby(level=0) # 唯一索引用.groupby(level=0),将同一个index的分为一组 当用index做分组的时候,用level
print(grouped)
print(grouped.first(),'→ first:非NaN的第一个值\n')
print(grouped.last(),'→ last:非NaN的最后一个值\n')
print(grouped.sum(),'→ sum:非NaN的和\n')
print(grouped.mean(),'→ mean:非NaN的平均值\n')
print(grouped.median(),'→ median:非NaN的算术中位数\n')
print(grouped.count(),'→ count:非NaN的值\n')
print(grouped.min(),'→ min、max:非NaN的最小值、最大值\n')
print(grouped.std(),'→ std,var:非NaN的标准差和方差\n')
print(grouped.prod(),'→ prod:非NaN的积\n')
输出结果:
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000002B38F39B4E0>
1 1
2 2
3 3
dtype: int64 → first:非NaN的第一个值
1 10
2 20
3 30
dtype: int64 → last:非NaN的最后一个值
1 11
2 22
3 33
dtype: int64 → sum:非NaN的和
1 5.5
2 11.0
3 16.5
dtype: float64 → mean:非NaN的平均值
1 5.5
2 11.0
3 16.5
dtype: float64 → median:非NaN的算术中位数
1 2
2 2
3 2
dtype: int64 → count:非NaN的值
1 1
2 2
3 3
dtype: int64 → min、max:非NaN的最小值、最大值
1 6.363961
2 12.727922
3 19.091883
dtype: float64 → std,var:非NaN的标准差和方差
1 10
2 40
3 90
dtype: int64 → prod:非NaN的积
6.
多函数计算:agg()
df = pd.DataFrame({'a':[1,1,2,2],
'b':np.random.rand(4),
'c':np.random.rand(4),
'd':np.random.rand(4),})
print(df)
print(df.groupby('a').agg(['mean',np.sum])) #计算一个均值和一个求和 会把b,c,d的每一列都会计算一个mean和sum
print(df.groupby('a')['b'].agg({'result1':np.mean,
'result2':np.sum})) #按a分组后b这一列的均值和求和
# 函数写法可以用str,或者np.方法
# 可以通过list,dict传入,当用dict时,key名为columns → 更新pandas后会出现警告
# 尽量用list传入
输出结果:
a b c d
0 1 0.758848 0.375900 0.962917
1 1 0.430484 0.322437 0.402809
2 2 0.285699 0.230663 0.525483
3 2 0.676740 0.191693 0.874899
b c d
mean sum mean sum mean sum
a
1 0.594666 1.189331 0.349169 0.698337 0.682863 1.365727
2 0.481219 0.962438 0.211178 0.422356 0.700191 1.400382
result1 result2
a
1 0.594666 1.189331
2 0.481219 0.962438
C:\Users\__main__.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
练习题:
作业1:按要求创建Dataframe df,并通过分组得到以下结果
① 以A分组,求出C,D的分组平均值
② 以A,B分组,求出D,E的分组求和
③ 以A分组,得到所有分组,以字典显示
④ 按照数值类型分组,求和
⑤ 将C,D作为一组分出来,并计算求和
⑥ 以B分组,求出每组的均值,求和,最大值,最小值
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.DataFrame({'A':['one','two','three','one','two','three','one','two'],
'B':list('hhhhffff'),
'C':list(range(10,25,2)),
'D':np.random.rand(8),
'E':np.random.rand(8)})
#(1)
print(df.groupby(by = 'A')[['D','E']].mean())
#(2)
print(df.groupby(by = ['A','B'])[['D','E']].sum())
#(3)
g = df.groupby(by = 'A')
for n,i in g:
print(n)
print(i)
#(4)
print(df.groupby(df.dtypes,axis = 1).sum())
#(5)
df2 = df[['C','D']]
print(df2)
df2['sum'] = df2.sum(axis = 1)
print(df2)
#(6)
print(df.groupby('B').agg(['mean','sum','max',np.min]))
输出结果:
D E
A
one 0.371521 0.524208
three 0.549758 0.513263
two 0.407525 0.511265
D E
A B
one f 0.505448 0.961745
h 0.609115 0.610880
three f 0.856872 0.495862
h 0.242644 0.530664
two f 0.679124 1.358688
h 0.543450 0.175106
one
A B C D E
0 one h 10 0.258020 0.411822
3 one h 16 0.351095 0.199058
6 one f 22 0.505448 0.961745
three
A B C D E
2 three h 14 0.242644 0.530664
5 three f 20 0.856872 0.495862
two
A B C D E
1 two h 12 0.543450 0.175106
4 two f 18 0.010620 0.919586
7 two f 24 0.668504 0.439102
int64 float64 object
0 10 0.669841 oneh
1 12 0.718555 twoh
2 14 0.773309 threeh
3 16 0.550153 oneh
4 18 0.930206 twof
5 20 1.352733 threef
6 22 1.467194 onef
7 24 1.107606 twof
C D
0 10 0.258020
1 12 0.543450
2 14 0.242644
3 16 0.351095
4 18 0.010620
5 20 0.856872
6 22 0.505448
7 24 0.668504
C D sum
0 10 0.258020 10.258020
1 12 0.543450 12.543450
2 14 0.242644 14.242644
3 16 0.351095 16.351095
4 18 0.010620 18.010620
5 20 0.856872 20.856872
6 22 0.505448 22.505448
7 24 0.668504 24.668504
C D E \
mean sum max amin mean sum max amin mean
B
f 21 84 24 18 0.510361 2.041444 0.856872 0.010620 0.704074
h 13 52 16 10 0.348802 1.395209 0.543450 0.242644 0.329162
sum max amin
B
f 2.816295 0.961745 0.439102
h 1.316650 0.530664 0.175106