import numpy as np
import pandas as pd

#DataFrame创建
#1.通过字典创建三行两列,使用默认索引
d = {"code":[1,2,3],"name":['zhangsan','lisi','wangwu']}
pd.DataFrame(data=d)
# Out[162]: 
#    code      name
# 0     1  zhangsan
# 1     2      lisi
# 2     3    wangwu

#2.通过字典创建两行两列,指定name作为索引,socre为得分列
d = {"score":[1,2,3],"name":['zhangsan','lisi','wangwu']}
pd.DataFrame(data=d['score'],index=[d['name']])
# Out[163]: 
#           0
# zhangsan  1
# lisi      2
# wangwu    3


#3.通过Series创建
#Series中的索引直接作为pd的索引
sr1 = pd.Series(['zhangsan','lisi','wangwu'],index=['a','b','c'])
sr2 = pd.Series([4,5,6],index=['b','c','a'])
pd.DataFrame({'one':sr1,'two':sr2})
# Out[164]: 
#         one  two
# a  zhangsan    6
# b      lisi    4
# c    wangwu    5

#4.手动创建空的DataFrame,指定数据列
pd.DataFrame(columns=['name','score'])
# Out[165]: 
# Empty DataFrame
# Columns: [name, score]
# Index: []


#5.手动创建空的DataFrame,指定数据列,指定索引
pd.DataFrame(columns=['name','score'],index=['id'])
# Out[166]: 
#    name score
# id  NaN   NaN

#5.手动创建空的DataFrame,指定数据列,指定索引
pd.DataFrame(columns=['name','score'],index=['id'])
# Out[167]: 
#    name score
# id  NaN   NaN

#pd.date_range
#6.指定起止日期的索引序列
pd.date_range('20210201','20210210')
# Out[168]: 
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
#                '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
#                '2021-02-09', '2021-02-10'],
#               dtype='datetime64[ns]', freq='D')

#7.指定开始日期和日期数量为10个的索引
pd.date_range('20210201',periods=10)
# Out[169]: 
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
#                '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
#                '2021-02-09', '2021-02-10'],
#               dtype='datetime64[ns]', freq='D')


#8.指定结束日期和日期序列数量
pd.date_range(end='20210201',periods=10)
# DatetimeIndex(['2021-01-23', '2021-01-24', '2021-01-25', '2021-01-26',
#                '2021-01-27', '2021-01-28', '2021-01-29', '2021-01-30',
#                '2021-01-31', '2021-02-01'],
#               dtype='datetime64[ns]', freq='D')


#9.指定开始日期、日期序列数量和日期间隔频率
#这里指定开始日期和数量,同时按照3d(三天)的的间隔来生成日期
pd.date_range(start='20210220',periods=15,freq='3D')
# DatetimeIndex(['2021-02-20', '2021-02-23', '2021-02-26', '2021-03-01',
#                '2021-03-04', '2021-03-07', '2021-03-10', '2021-03-13',
#                '2021-03-16', '2021-03-19', '2021-03-22', '2021-03-25',
#                '2021-03-28', '2021-03-31', '2021-04-03'],
#               dtype='datetime64[ns]', freq='3D')


#10.指定结束日期、日期序列数量和频率
pd.date_range(end='20210220',periods=15,freq='2D')
# Out[172]: 
# DatetimeIndex(['2021-01-23', '2021-01-25', '2021-01-27', '2021-01-29',
#                '2021-01-31', '2021-02-02', '2021-02-04', '2021-02-06',
#                '2021-02-08', '2021-02-10', '2021-02-12', '2021-02-14',
#                '2021-02-16', '2021-02-18', '2021-02-20'],
#               dtype='datetime64[ns]', freq='2D')

#11.指定起止日期和inclusive参数,inclusive表示闭区间,这里是左闭右开
#inclusive的枚举值:{“both”, “neither”, “left”, “right”}, default “both”
pd.date_range('20210201','20210220',inclusive='left')
# Out[173]: 
# DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
#                '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
#                '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12',
#                '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16',
#                '2021-02-17', '2021-02-18', '2021-02-19'],
#               dtype='datetime64[ns]', freq='D')


#12.指定起止日期和inclusive参数,inclusive表示闭区间,这里是左开右闭
pd.date_range('20210201','20210220',inclusive='right')
# Out[174]: 
# DatetimeIndex(['2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05',
#                '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09',
#                '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13',
#                '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17',
#                '2021-02-18', '2021-02-19', '2021-02-20'],
#               dtype='datetime64[ns]', freq='D')


#13.时间序列做为Series的索引
dates = pd.date_range(start='20200101',periods=5,freq='1D')
pd.Series(range(10,20,2),index=dates)
# Out[175]: 
# 2020-01-01    10
# 2020-01-02    12
# 2020-01-03    14
# 2020-01-04    16
# 2020-01-05    18
# Freq: D, dtype: int64

#14.时间序列做行索引,生成DateFrame二维数组
dates = pd.date_range(start='20200101',periods=5,freq='2D')
pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
# Out[176]: 
#                    A         B         C         D         E
# 2020-01-01  0.322104 -0.632878  0.455840 -0.400103  0.067897
# 2020-01-03 -0.416713  1.896914  1.278218 -1.405646  0.966367
# 2020-01-05 -0.914046  0.370047  0.057120  0.197381 -0.772321
# 2020-01-07  0.293243  1.451900  0.762968 -0.212332  1.128985
# 2020-01-09 -0.498262  0.388120  1.197879 -1.363563  0.720579


#15.获取DataFrame的索引信息
dates = pd.date_range(start='20200101',periods=5,freq='1D')
df=pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
df.index
# Out[177]: 
# DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
#                '2020-01-05'],
#               dtype='datetime64[ns]', freq='D')

#16.获取指定索引行,所有列的数据
d = np.random.randn(5,5)
dates = pd.date_range(start='20200101',periods=5,freq='1D')
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01',:]
# Out[178]: 
# A    0.352445
# B    0.851941
# C   -0.149310
# D   -0.360615
# E   -0.838793
# Name: 2020-01-01 00:00:00, dtype: float64


#与16等价
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01']
# Out[179]: 
# A    0.352445
# B    0.851941
# C   -0.149310
# D   -0.360615
# E   -0.838793
# Name: 2020-01-01 00:00:00, dtype: float64

#17.获取指定行指定列的数据
df=pd.DataFrame(data=d, index=dates,columns=list('ABCDE'))
df.loc['2020-01-01','A']
df.loc['2020-01-01',['A','C']]
# Out[180]: 
# A    0.352445
# C   -0.149310
# Name: 2020-01-01 00:00:00, dtype: float64

#18.获取前两行,后两列的数据
df.iloc[:2,-2:]
# Out[181]: 
#                    D         E
# 2020-01-01 -0.360615 -0.838793
# 2020-01-02  0.621101 -1.155256

#18.获取第两行第三列的数据
df
df.iloc[1,2]
# Out[182]: 0.06651359417422138

#19.两个DataFrame相加,索引对齐
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df2 = pd.DataFrame({'two':[3,5,7,9],'one':[2,4,6,8]},index=['b','a','d','c'])
df1+df2
# Out[183]: 
#    one  two
# a    5   10
# b    4    9
# c   11   16
# d   10   15

#20.获列字段的元数据信息
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df1.columns
# Out[184]: Index(['one', 'two'], dtype='object')

#21.获索引行字段的元数据信息
df1 = pd.DataFrame({'one':[1,2,3,4],'two':[5,6,7,8]},index=['a','b','c','d'])
df1.index
# Out[185]: Index(['a', 'b', 'c', 'd'], dtype='object')

#22.数据缺失值填充
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1.fillna('-1')#缺失值填充成-1
# Out[186]: 
#    one  two
# a  1.0  5.0
# b  2.0  6.0
# c   -1  7.0
# d  4.0   -1

#23.删除缺失值数据
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1.dropna() #只要有缺失值,基相关的行和列数据都会删除
# Out[187]: 
#    one  two
# a  1.0  5.0
# b  2.0  6.0

#通过布尔表达式进行数据筛选
#24.布尔表达式-数值型过滤
df1 = pd.DataFrame({'one':[1,2,np.nan,4],'two':[5,6,7,np.nan]},index=['a','b','c','d'])
df1
df1[df1['one'].isnull()] #找出one列为nan的数据
# Out[189]: 
#    one  two
# c  NaN  7.0

df1[df1['one'].notnull()] #找出one列不为nan的数据
# Out[190]: 
#    one  two
# a  1.0  5.0
# b  2.0  6.0
# d  4.0  NaN

df1[df1['one']>2] #找出one列大于2的数据
# Out[188]: 
#    one  two
# d  4.0  NaN

#25.布尔表达式-日期型过滤
dates = pd.date_range(start='20210301',periods=5,freq='1D',name='d')
df=pd.DataFrame(data=np.random.randn(5,5), index=dates,columns=list('ABCDE'))
df[df.index > '2021-03-01'] #过滤掉索引中大于0301的数据
# Out[191]: 
#                    A         B         C         D         E
# d                                                           
# 2021-03-02 -1.575225 -0.107793  0.549712 -0.811645  0.017544
# 2021-03-03 -0.193079 -0.192127  0.357715  1.182692 -0.889554
# 2021-03-04  1.258205 -0.323691  0.536715 -1.048818 -0.407462
# 2021-03-05 -1.436989  0.262423  0.961729 -0.614666 -0.086917

df[df.index != '2021-03-01'] #过滤掉索引中大于0301的数据
# Out[192]: 
#                    A         B         C         D         E
# d                                                           
# 2021-03-02 -1.575225 -0.107793  0.549712 -0.811645  0.017544
# 2021-03-03 -0.193079 -0.192127  0.357715  1.182692 -0.889554
# 2021-03-04  1.258205 -0.323691  0.536715 -1.048818 -0.407462
# 2021-03-05 -1.436989  0.262423  0.961729 -0.614666 -0.086917

#26.DataFrame增加数据行
df1=pd.DataFrame(columns=['name','score'])
df1.loc[0] = ['zhangsan',20] #对索引行0增加数据,数据该行有值则替换
df1
# Out[194]: 
#        name  score
# 0  zhangsan     20

df1.loc[len(df1.index)] = ['lisi',30] #对索引行0增加数据,数据该行有值则替换
df1
# Out[196]: 
#        name  score
# 0  zhangsan     20
# 1      lisi     30

#27.DataFrame增加数据列
df1=pd.DataFrame(columns=['name','score'])
df1.loc[0] = ['zhangsan',20] #对索引行0增加数据,数据该行有值则替换
df1
# Out[197]: 
#        name  score
# 0  zhangsan     20

df1['sex'] = ''
df1
# Out[199]: 
#        name  score sex
# 0  zhangsan     20

#DataFrame索引分层
#28.将已有DataFrame做索引进行分层
df = pd.DataFrame(columns=['date','key','score','flag'])
df.loc[len(df.index)] = ['20210101','10203',100,1]
df.loc[len(df.index)] = ['20210101','10204',200,1]
df
# Out[200]: 
#        date    key  score  flag
# 0  20210101  10203    100     1
# 1  20210101  10204    200     1


df2 = df.set_index( [ 'date','key' ]  )  #设置索引为两个
df2
# Out[201]: 
#                 score  flag
# date     key               
# 20210101 10203    100     1
#          10204    200     1

#29.直接定义一个索引分层的DataFrame
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df
# Out[202]: 
#          score flag
# date key   NaN  NaN

#30.向索引分层的DataFrame写入数据
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df.loc[('20210102','10204'),:] = [100,10]
df.loc[('20210102','10203'),:] = [100,20]
df.loc[('20210103','10203'),:] = [100,30]
df
# Out[203]: 
#                score flag
# date     key     NaN  NaN
# 20210102 10204   100   10
#          10203   100   20
# 20210103 10203   100   30

#31.索引分层的DataFrame使用
df = pd.DataFrame(columns=['score','flag'],index=[['date'],['key']])
df.loc[('20210101','10104'),:] = [100,10]
df.loc[('20210102','10204'),:] = [100,10]
df.loc[('20210102','10203'),:] = [100,20]
df.loc[('20210103','10303'),:] = [100,30]

df.loc[[('20210103')],:] #按索引过滤出数据
# Out[204]: 
#                score flag
# 20210103 10303   100   30

df.loc[[('20210102','10203')],:] #按索引过滤出数据
# Out[205]: 
#                score flag
# 20210102 10203   100   20

#32.判断索引是否存在
('20210103','10303') in df.index
# Out[206]: True

('20210103','10503') in df.index
# Out[207]: False