数据库风格的dataframe合并 merge
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],"data1":range(7)})
df2 = DataFrame({'key':['a','b','d'],"data2":range(3)})
df1
# data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
df2
# data2 key
0 0 a
1 1 b
2 2 d
pd.merge(df1,df2,on="key") #如果两个列名不一样可以用left_on和right_on指定左右的列名
# data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
pd.merge(df1,df2,on="key",how = "left")
# data1 key data2
0 0 b 1.0
1 1 b 1.0
2 2 a 0.0
3 3 c NaN
4 4 a 0.0
5 5 a 0.0
6 6 b 1.0
pd.merge(df1,df2,on="key",how = "outer") #outer是left和right的并集
# data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
如果有两个重复的列名可以用suffixes用于指定两个重复列名的名字的附加字符串
比如pd.merge(df1,df2,on="key",suffixes = ("_left","_right"))
索引上的合并
如果需要用索引当做连接键,可以使用left_index = True 或 right_index = True 或者两个都传
left1 = DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
right1 = DataFrame({'goupe_val':[3.5,7]},index = ['a','b'])
left1
# key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
right1
#
goupe_val
a 3.5
b 7.0
pd.merge(left1,right1,left_on = "key",right_index = True)
#
key value goupe_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
"""对于层次化索引 """
lefth=DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
'key2':[2000,2001,2002,2001,2002],'data':np.arange(5.0)})
righth=DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
lefth
# data key1 key2
0 0.0 Ohio 2000
1 1.0 Ohio 2001
2 2.0 Ohio 2002
3 3.0 Nevada 2001
4 4.0 Nevada 2002
righth
# event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
#
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 5
0 0.0 Ohio 2000 6 7
1 1.0 Ohio 2001 8 9
2 2.0 Ohio 2002 10 11
3 3.0 Nevada 2001 0 1
轴向连接
另一种合并运算也叫作连接或堆叠,numpy有一个原始合并的函数 叫concatenate
arr = np.arange(12).reshape(3,4)
np.concatenate([arr,arr],axis = 1)
#array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
对于pandas有一个concat函数
我们先来看Series
s1 = Series([0,1],index = ['a','b'])
s2 = Series([2,3,4],index = ['c','d','e'])
s3 = Series([5,6],index = ['f','g'])
pd.concat([s1,s2,s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64 #这里如果设置axis= 1 会变成一个dataframe
s4 = pd.concat([s1*5,s3])
s4
#a 0
b 5
f 5
g 6
dtype: int64
pd.concat([s1,s4],axis = 1)
# 0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6
pd.concat([s1,s4],axis =1 ,join = "inner")
# 0 1
a 0 0
b 1 5
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
# 0 1
a 0.0 0.0
c NaN NaN
b 1.0 5.0
e NaN NaN
#可以通过join_axes指定要在其他轴上使用的索引(注意:join_axes参数已在pandas 1.0中移除,新版本请改写为 pd.concat([s1,s4],axis=1).reindex(['a','c','b','e']))
#如果想在连接轴上创建一个层次化索引,使用keys参数即可达到这个目的
result=pd.concat([s1,s1,s3],keys=['one','two','three'])
result
#one a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64
result.unstack()
# a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0
#如果沿着axis=1对Series合并,则keys就会成为DataFrame的列头
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
#同样的逻辑对DataFrame对象也是一样
df1=DataFrame(np.arange(6).reshape((3,2)),index=['a','b','c'],columns=['one','two'])
df2=DataFrame(5+np.arange(4).reshape((2,2)),index=['a','c'],columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
#
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
#如果传入的不是列表而是一个字典,则字典的键会被当做keys选项的值
pd.concat({'level1':df1,'level2':df2},axis=1)
#此外还有两个管理层次化索引创建方式的参数
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])
#upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
#考虑跟当前分析工作无关的DataFrame行索引
df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df1
# a b c d
0 0.542878 0.891101 -0.921924 0.298668
1 -0.307432 0.145520 -0.842926 -0.576387
2 0.675326 -0.602455 0.802425 -0.267735
df2
# b d a
0 -0.476512 0.685691 1.712411
1 2.065149 1.298324 0.469601
pd.concat([df1,df2])
#
a b c d
0 0.542878 0.891101 -0.921924 0.298668
1 -0.307432 0.145520 -0.842926 -0.576387
2 0.675326 -0.602455 0.802425 -0.267735
0 1.712411 -0.476512 NaN 0.685691
1 0.469601 2.065149 NaN 1.298324
#在这种情况下,传入ignore_index=True即可
pd.concat([df1,df2],ignore_index=True)
#
a b c d
0 0.542878 0.891101 -0.921924 0.298668
1 -0.307432 0.145520 -0.842926 -0.576387
2 0.675326 -0.602455 0.802425 -0.267735
3 1.712411 -0.476512 NaN 0.685691
4 0.469601 2.065149 NaN 1.298324
#Series有一个combine_first方法,实现的也是一样的功能,而且会进行数据对齐(注:下面的a、b是前文定义的两个含NaN的Series,其定义未包含在本段笔记中)
b[:-2].combine_first(a[2:])
#a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
#对于DataFrame,combine_first自然也会在列上做同样的事情,因此你可以将其看做:用参数对象的数据为调用者对象的缺失数据‘打补丁’
df1=DataFrame({'a':[1.,np.nan,5.,np.nan],
'b':[np.nan,2.,np.nan,6.],
'c':range(2,18,4)})
df2=DataFrame({'a':[5.,4.,np.nan,3.,7.],
'b':[np.nan,3.,4.,6.,8.]})
df1.combine_first(df2)
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
移除重复数据
data=DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
data
# k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
data.duplicated()
#0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
data.drop_duplicates()
#
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
映射
data=DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
# food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
meat_to_animal={'bacon':'pig','pulled pork':'pig','pastrami':'cow'
,'corned beef':'cow','honey ham':'pig','nova lox':'salmon'}
data['meat']= data['food'].map(str.lower).map(meat_to_animal)
data
# food ounces meat
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
替换值
data=Series([1.,-999.,2.,-999.,-1000.,3.])
data
#0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data.replace(-999,np.nan)
#一次性替换多个值,可以传入一个由待替换值组成的列表及一个替换值
data.replace([-999,-1000],np.nan)
#如果希望对不同的值进行不同的替换,则传入一个由替换关系组成的列表即可
data.replace([-999,-1000],[np.nan,0])
#传入的参数也可以是字典
data.replace({-999:np.nan,-1000:0})
重命名轴索引
轴标签也可以通过map映射进行转换
df = DataFrame(np.arange(12).reshape(3,4),index = ['Ohio','Colonado',"New York"],columns = ['one','two','three','four'])
df
# one two three four
Ohio 0 1 2 3
Colonado 4 5 6 7
New York 8 9 10 11
df.index.map(str.upper) #只是返回大写后的新Index,并不会修改df本身
df.index = df.index.map(str.upper) #修改了index
df.rename(index = str.title,columns = str.upper) #最好还是rename 不直接修改 想直接修改原文本可以加上inplace =True
df.rename(index = {'Ohio':'OHIO'},columns={"one":"One"},inplace = True) #也可以传入字典
离散化和面元划分
连续数据有时候经常需要被拆分成离散数据或面元(bins)
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
cats
#[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.categories
#IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
pd.value_counts(cats)
#(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
#跟区间的数学符号一样,圆括号表示开端,方括号表示闭端(包括)。哪边是闭端可以通过right=False进行修改
pd.cut(ages,[18,26,36,61,100],right=False)
#[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
#你也可以设置自己的面元名称,将labels选项设置为一个列表或数组即可
group_names=['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)
Out[6]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
In [7]:
#如果向cut传入的是面元的数量而不是确切的面元边界,则它会根据数据的最小值和最大值计算等长面元。
data=np.random.randn(20)
pd.cut(data,4,precision=2)
Out[7]:
[(2.14, 3.44], (0.83, 2.14], (-0.47, 0.83], (-1.78, -0.47], (-1.78, -0.47], ..., (2.14, 3.44], (0.83, 2.14], (-1.78, -0.47], (-0.47, 0.83], (-0.47, 0.83]]
Length: 20
Categories (4, interval[float64]): [(-1.78, -0.47] < (-0.47, 0.83] < (0.83, 2.14] < (2.14, 3.44]]
In [8]:
#qcut是一个非常类似cut的函数,它可以根据样本分位数对数据进行面元划分。根据数据的分布情况,cut可能无法使各个面元
# 中含有相同的数据点,而qcut由于使用的是样本分位数,因此可以得到大小基本相等的面元
data=np.random.randn(1000)#正态分布
cats=pd.qcut(data,4)#按四分位数进行切割
cats
Out[8]:
[(0.0313, 0.682], (-0.651, 0.0313], (0.0313, 0.682], (0.682, 3.171], (-3.177, -0.651], ..., (0.682, 3.171], (-3.177, -0.651], (-0.651, 0.0313], (-3.177, -0.651], (0.0313, 0.682]]
Length: 1000
Categories (4, interval[float64]): [(-3.177, -0.651] < (-0.651, 0.0313] < (0.0313, 0.682] < (0.682, 3.171]]
In [9]:
pd.value_counts(cats)
Out[9]:
(0.682, 3.171] 250
(0.0313, 0.682] 250
(-0.651, 0.0313] 250
(-3.177, -0.651] 250
dtype: int64
In [10]:
#跟cut一样,可以设置自定义的分位数
pd.qcut(data,[0,0.1,0.5,0.9,1.])
Out[10]:
[(0.0313, 1.281], (-1.286, 0.0313], (0.0313, 1.281], (1.281, 3.171], (-1.286, 0.0313], ..., (0.0313, 1.281], (-3.177, -1.286], (-1.286, 0.0313], (-3.177, -1.286], (0.0313, 1.281]]
Length: 1000
Categories (4, interval[float64]): [(-3.177, -1.286] < (-1.286, 0.0313] < (0.0313, 1.281] < (1.281, 3.171]]
检测和过滤异常值
np.random.seed(12345)
data=DataFrame(np.random.randn(1000,4))
#要选出全部含有“超过3或-3的值”的行,你可以利用布尔型DataFrame以及any方法(新版pandas中轴参数应写成 .any(axis=1),位置参数已被弃用)
data[(np.abs(data)>3).any(1)]
#将值限制在区间-3到3以内
data[np.abs(data)>3]=np.sign(data)*3#np.sign这个ufunc返回的是一个由1和-1组成的数组,表示 原始值的符号
排列和随机采样
利用numpy.random.permutation函数可以轻松实现对Series或DataFrame的列的排列工作(permuting,随机重排序)
df=DataFrame(np.arange(5*4).reshape(5,4))
sampler=np.random.permutation(5)
sampler
Out[51]:
array([3, 2, 4, 0, 1])
In [53]:
df.take(sampler)
Out[53]:
0 1 2 3
3 12 13 14 15
2 8 9 10 11
4 16 17 18 19
0 0 1 2 3
1 4 5 6 7
In [56]:
#只想随机选取若干行时,截取permutation结果的前5个位置即可(非替换式采样)
df.take(np.random.permutation(len(df))[:5])
Out[56]:
0 1 2 3
2 8 9 10 11
0 0 1 2 3
3 12 13 14 15
4 16 17 18 19
1 4 5 6 7
In [57]:
df
Out[57]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
In [58]:
#要通过替换的方式(即有放回抽样)产生样本,最快的方式是通过np.random.randint得到一组随机整数
bag=np.array([5,7,-1,6,4])
sampler=np.random.randint(0,len(bag),size=10)
sampler
Out[58]:
array([1, 2, 0, 2, 1, 4, 2, 4, 0, 4])
In [59]:
draws=bag.take(sampler)
draws
Out[59]:
array([ 7, -1, 5, -1, 7, 4, -1, 4, 5, 4])
计算指标,哑变量
df=DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df
Out[60]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
In [61]:
pd.get_dummies(df['key'])
Out[61]:
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [65]:
#给指标DataFrame的列加上一个前缀,以便能够跟其他数据进行合并。get_dummies的prefix参数可以实现该功能
dummies=pd.get_dummies(df['key'],prefix='key')
df_with_dummy=df[['data1']].join(dummies)
df_with_dummy
#给指标DataFrame的列加上一个前缀,以便能够跟其他数据进行合并。get_dummies的prefix参数可以实现该功能
dummies=pd.get_dummies(df['key'],prefix='key')
df_with_dummy=df[['data1']].join(dummies)
df_with_dummy
Out[65]:
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0
字符串操作