import numpy as np
import pandas as pd
#DataFrame创建
#1.通过字典创建三行两列,使用默认索引
d = {"code":[1,2,3],"name":['zhangsan','lisi','wangwu']}
pd.DataFrame(data=d)
# Out[162]:
# code name
# 0 1 zhangsan
# 1 2 lisi
# 2 3 wangwu
#2.通过字典创建两行两列,指定name作为索引,socre为得分列
d = {"score":[1,2,3],"name":['zhangsan','lisi','wangwu']}
pd.DataFrame(data=d['score'],index=[d['name']])
# Out[163]:
# 0
# zhangsan 1
# lisi 2
# wangwu 3
#3.通过Series创建
#Series中的索引直接作为pd的索引
sr1 = pd.Series(['zhangsan','lisi','wangwu'],index=['a','b','c'])
sr2 = pd.Series([4,5,6],index=['b','c','a'])
pd.DataFrame({'one':sr1,'two':sr2})
# Out[164]:
# one two
# a zhangsan 6
# b lisi 4
# c wangwu 5
#4.手动创建空的DataFrame,指定数据列
pd.DataFrame(columns=['name','score'])
# Out[165]:
# Empty DataFrame
# Columns: [name, score]
# Index: []
#5.手动创建空的DataFrame,指定数据列,指定索引
pd.DataFrame(columns=['name','score'],index=['id'])
# Out[166]:
# name score
# id NaN NaN
#5.手动创建空的DataFrame,指定数据列,指定索引
pd.DataFrame(columns=['name','score'],index=['id'])
# Out[167]:
# name score
# id NaN NaN
#pd.date_range
#6.指定起止日期的索引序列
pd.date_range('20210201','20210210')
# Out[168]:
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
# '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
# '2021-02-09', '2021-02-10'],
# dtype='datetime64[ns]', freq='D')
#7.指定开始日期和日期数量为10个的索引
pd.date_range('20210201',periods=10)
# Out[169]:
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
# '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
# '2021-02-09', '2021-02-10'],
# dtype='datetime64[ns]', freq='D')
#8.指定结束日期和日期序列数量
pd.date_range(end='20210201',periods=10)
# DatetimeIndex(['2021-01-23', '2021-01-24', '2021-01-25', '2021-01-26',
# '2021-01-27', '2021-01-28', '2021-01-29', '2021-01-30',
# '2021-01-31', '2021-02-01'],
# dtype='datetime64[ns]', freq='D')
#9.指定开始日期、日期序列数量和日期间隔频率
#这里指定开始日期和数量,同时按照3d(三天)的的间隔来生成日期
pd.date_range(start='20210220',periods=15,freq='3D')
# DatetimeIndex(['2021-02-20', '2021-02-23', '2021-02-26', '2021-03-01',
# '2021-03-04', '2021-03-07', '2021-03-10', '2021-03-13',
# '2021-03-16', '2021-03-19', '2021-03-22', '2021-03-25',
# '2021-03-28', '2021-03-31', '2021-04-03'],
# dtype='datetime64[ns]', freq='3D')
#10.指定结束日期、日期序列数量和频率
pd.date_range(end='20210220',periods=15,freq='2D')
# Out[172]:
# DatetimeIndex(['2021-01-23', '2021-01-25', '2021-01-27', '2021-01-29',
# '2021-01-31', '2021-02-02', '2021-02-04', '2021-02-06',
# '2021-02-08', '2021-02-10', '2021-02-12', '2021-02-14',
# '2021-02-16', '2021-02-18', '2021-02-20'],
# dtype='datetime64[ns]', freq='2D')
#11.指定起止日期和inclusive参数,inclusive表示闭区间,这里是左闭右开
#inclusive的枚举值:{“both”, “neither”, “left”, “right”}, default “both”
pd.date_range('20210201','20210220',inclusive='left')
# Out[173]:
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
# '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
# '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12',
# '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16',
# '2021-02-17', '2021-02-18', '2021-02-19'],
# dtype='datetime64[ns]', freq='D')
#12.指定起止日期和inclusive参数,inclusive表示闭区间,这里是左开右闭
pd.date_range('20210201','20210220',inclusive='right')
# Out[174]:
# DatetimeIndex(['2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05',
# '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09',
# '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13',
# '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17',
# '2021-02-18', '2021-02-19', '2021-02-20'],
# dtype='datetime64[ns]', freq='D')
#13.时间序列做为Series的索引
dates = pd.date_range(start='20200101',periods=5,freq='1D')
pd.Series(range(10,20,2),index=dates)
# Out[175]:
# 2020-01-01 10
# 2020-01-02 12
# 2020-01-03 14
# 2020-01-04 16
# 2020-01-05 18
# Freq: D, dtype: int64
#14.时间序列做行索引,生成DateFrame二维数组
dates = pd.date_range(start='20200101',periods=5,freq='2D')
pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
# Out[176]:
# A B C D E
# 2020-01-01 0.322104 -0.632878 0.455840 -0.400103 0.067897
# 2020-01-03 -0.416713 1.896914 1.278218 -1.405646 0.966367
# 2020-01-05 -0.914046 0.370047 0.057120 0.197381 -0.772321
# 2020-01-07 0.293243 1.451900 0.762968 -0.212332 1.128985
# 2020-01-09 -0.498262 0.388120 1.197879 -1.363563 0.720579
#15.获取DataFrame的索引信息
dates = pd.date_range(start='20200101',periods=5,freq='1D')
df=pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
df.index
# Out[177]:
# DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
# '2020-01-05'],
# dtype='datetime64[ns]', freq='D')
#16.获取指定索引行,所有列的数据
d = np.random.randn(5,5)
dates = pd.date_range(start='20200101',periods=5,freq='1D')
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01',:]
# Out[178]:
# A 0.352445
# B 0.851941
# C -0.149310
# D -0.360615
# E -0.838793
# Name: 2020-01-01 00:00:00, dtype: float64
#与16等价
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01']
# Out[179]:
# A 0.352445
# B 0.851941
# C -0.149310
# D -0.360615
# E -0.838793
# Name: 2020-01-01 00:00:00, dtype: float64
#17.获取指定行指定列的数据
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01','A']
df.loc['2020-01-01',['A','C']]
# Out[180]:
# A 0.352445
# C -0.149310
# Name: 2020-01-01 00:00:00, dtype: float64
#18.获取前两行,后两列的数据
df.iloc[:2,-2:]
# Out[181]:
# D E
# 2020-01-01 -0.360615 -0.838793
# 2020-01-02 0.621101 -1.155256
#18.获取第两行第三列的数据
df
df.iloc[1,2]
# Out[182]: 0.06651359417422138
#19.两个DataFrame相加,索引对齐
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df2 = pd.DataFrame({'two':[3,5,7,9],'one':[2,4,6,8]},index=['b','a','d','c'])
df1+df2
# Out[183]:
# one two
# a 5 10
# b 4 9
# c 11 16
# d 10 15
#20.获列字段的元数据信息
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df1.columns
# Out[184]: Index(['one', 'two'], dtype='object')
#21.获索引行字段的元数据信息
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df1.index
# Out[185]: Index(['a', 'b', 'c', 'd'], dtype='object')
#22.数据缺失值填充
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1.fillna('-1')#缺失值填充成-1
# Out[186]:
# one two
# a 1.0 5.0
# b 2.0 6.0
# c -1 7.0
# d 4.0 -1
#23.删除缺失值数据
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1.dropna() #只要有缺失值,基相关的行和列数据都会删除
# Out[187]:
# one two
# a 1.0 5.0
# b 2.0 6.0
#通过布尔表达式进行数据筛选
#24.布尔表达式-数值型过滤
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1[df1['one'].isnull()] #找出one列为nan的数据
# Out[189]:
# one two
# c NaN 7.0
df1[df1['one'].notnull()] #找出one列不为nan的数据
# Out[190]:
# one two
# a 1.0 5.0
# b 2.0 6.0
# d 4.0 NaN
df1[df1['one']>2] #找出one列大于2的数据
# Out[188]:
# one two
# d 4.0 NaN
#25.布尔表达式-日期型过滤
dates = pd.date_range(start='20210301',periods=5,freq='1D',name='d')
df=pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
df[df.index > '2021-03-01'] #过滤掉索引中大于0301的数据
# Out[191]:
# A B C D E
# d
# 2021-03-02 -1.575225 -0.107793 0.549712 -0.811645 0.017544
# 2021-03-03 -0.193079 -0.192127 0.357715 1.182692 -0.889554
# 2021-03-04 1.258205 -0.323691 0.536715 -1.048818 -0.407462
# 2021-03-05 -1.436989 0.262423 0.961729 -0.614666 -0.086917
df[df.index != '2021-03-01'] #过滤掉索引中大于0301的数据
# Out[192]:
# A B C D E
# d
# 2021-03-02 -1.575225 -0.107793 0.549712 -0.811645 0.017544
# 2021-03-03 -0.193079 -0.192127 0.357715 1.182692 -0.889554
# 2021-03-04 1.258205 -0.323691 0.536715 -1.048818 -0.407462
# 2021-03-05 -1.436989 0.262423 0.961729 -0.614666 -0.086917
#26.DataFrame增加数据行
df1=pd.DataFrame(columns=['name','score'])
df1.loc[0] = ['zhangsan',20] #对索引行0增加数据,数据该行有值则替换
df1
# Out[194]:
# name score
# 0 zhangsan 20
df1.loc[len(df1.index)] = ['lisi',30] #对索引行0增加数据,数据该行有值则替换
df1
# Out[196]:
# name score
# 0 zhangsan 20
# 1 lisi 30
#27.DataFrame增加数据列
df1=pd.DataFrame(columns=['name','score'])
df1.loc[0] = ['zhangsan',20] #对索引行0增加数据,数据该行有值则替换
df1
# Out[197]:
# name score
# 0 zhangsan 20
df1['sex'] = ''
df1
# Out[199]:
# name score sex
# 0 zhangsan 20
#DataFrame索引分层
#28.将已有DataFrame做索引进行分层
df = pd.DataFrame(columns=['date','key','score','flag'])
df.loc[len(df.index)] = ['20210101','10203',100,1]
df.loc[len(df.index)] = ['20210101','10204',200,1]
df
# Out[200]:
# date key score flag
# 0 20210101 10203 100 1
# 1 20210101 10204 200 1
df2 = df.set_index( [ 'date','key' ] ) #设置索引为两个
df2
# Out[201]:
# score flag
# date key
# 20210101 10203 100 1
# 10204 200 1
#29.直接定义一个索引分层的DataFrame
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df
# Out[202]:
# score flag
# date key NaN NaN
#30.向索引分层的DataFrame写入数据
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df.loc[('20210102','10204'),:] = [100,10]
df.loc[('20210102','10203'),:] = [100,20]
df.loc[('20210103','10203'),:] = [100,30]
df
# Out[203]:
# score flag
# date key NaN NaN
# 20210102 10204 100 10
# 10203 100 20
# 20210103 10203 100 30
#31.索引分层的DataFrame使用
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df.loc[('20210101','10104'),:] = [100,10]
df.loc[('20210102','10204'),:] = [100,10]
df.loc[('20210102','10203'),:] = [100,20]
df.loc[('20210103','10303'),:] = [100,30]
df.loc[[('20210103')],:] #按索引过滤出数据
# Out[204]:
# score flag
# 20210103 10303 100 30
df.loc[[('20210102','10203')],:] #按索引过滤出数据
# Out[205]:
# score flag
# 20210102 10203 100 20
#32.判断索引是否存在
('20210103','10303') in df.index
# Out[206]: True
('20210103','10503') in df.index
# Out[207]: False