1)移除重复数据(duplicated)
2)利用函数或映射进行数据转换(map)
3)替换值(replace)
4)重命名轴索引
5)检测和过滤异常值(逻辑索引)
6)随机采样或选出随机子集
7)计算哑变量(get_dummies)
检查某列数据是否重复可用.is_unique
检查某行数据是否重复可用.duplicated
import pandas as pd
import numpy as np
data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,'k2':[1,1,2,3,3,4,4]})
data
| k1 | k2 |
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
data.index.is_unique#检查列
True
data.k1.is_unique#检查列
False
data['k2'].is_unique#检查列
False
data.is_unique
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-12-3c5fb82b7563> in <module>()
----> 1 data.is_unique
C:\Program Files\anaconda\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
2670 if name in self._info_axis:
2671 return self[name]
-> 2672 return object.__getattribute__(self, name)
2673
2674 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'is_unique'
data.duplicated()#检查行
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
data.drop_duplicates()
| k1 | k2 |
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
set(data.k1)#保留唯一的列属性值
{'one', 'two'}
1) drop_duplicates、duolicated函数只能用于DataFrame
2) is_unique不能用于DataFrame
data1 = pd.DataFrame({'food':['bacon','pork','bacon','Pastrami',\
'beef','Bacon','pastrami','ham','lox'],\
'ounces':[4,3,12,6,7.5,8,3,5,6]})
data1
| food | ounces |
0 | bacon | 4.0 |
1 | pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | ham | 5.0 |
8 | lox | 6.0 |
#step1:构建肉类到动物的映射
meat_to_animal = {'bacon':'pig','pork':'pig','pastrami':'cow','beef':'cow','ham':'pig',\
'lox':'salmon'}
Series的map方法可以接受一个函数或含有映射关系的字典型对象,字符的大小写要一致
#step2:映射
data1['animal'] = data1['food'].map(str.lower).map(meat_to_animal)
data1
| food | ounces | animal |
0 | bacon | 4.0 | pig |
1 | pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | ham | 5.0 | pig |
8 | lox | 6.0 | salmon |
#step2的另一种实现方法
data1['food'].map(lambda x:meat_to_animal[x.lower()])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
data1
| food | ounces | animal |
0 | bacon | 4.0 | pig |
1 | pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | ham | 5.0 | pig |
8 | lox | 6.0 | salmon |
map是一种实现元素级转换记忆其他数据清理工作的便捷方式
替换缺失值的方法:
1)fillna
2)含有重复索引的合并combine_first
3)replace
data2 = pd.Series([1.,-999,2,-999,-1000,3.])
data2
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
-999可能是一个表示缺失数据的标记值,要将其替换为pandas能够理解的NA值,可以利用replace
data2.replace(-999,np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data2
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
replace不改变原数据集
data2.replace([-999,-1000],np.nan)#一次传入一个列表即可
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
data2.replace([-999,-1000],[np.nan,0])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data2.replace({-999:np.nan,-1000:0})
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
跟Series中的值一样,轴标签也可以通过函数或映射进行转换,从而得到一个新对象,轴还可以被就地修改,而无需新建一个数据结构
data3 = pd.DataFrame(np.arange(12).reshape(3,4),index = ['a','b','c'],columns = ['one','two','three','four'])
data3
| one | two | three | four |
a | 0 | 1 | 2 | 3 |
b | 4 | 5 | 6 | 7 |
c | 8 | 9 | 10 | 11 |
data3.index.map(str.upper)
array(['A', 'B', 'C'], dtype=object)
data3
| one | two | three | four |
a | 0 | 1 | 2 | 3 |
b | 4 | 5 | 6 | 7 |
c | 8 | 9 | 10 | 11 |
data3.index = data3.index.map(str.upper)#修改了
data3
| one | two | three | four |
A | 0 | 1 | 2 | 3 |
B | 4 | 5 | 6 | 7 |
C | 8 | 9 | 10 | 11 |
还可以通过rename结合字典型对象实现对部分轴标签的更新
data3.rename(index = {'A':'aaa'},columns = {'three':'liu'})
| one | two | liu | four |
aaa | 0 | 1 | 2 | 3 |
B | 4 | 5 | 6 | 7 |
C | 8 | 9 | 10 | 11 |
data3#不改变原数据
| one | two | three | four |
A | 0 | 1 | 2 | 3 |
B | 4 | 5 | 6 | 7 |
C | 8 | 9 | 10 | 11 |
data3 = data3.rename(index = {'A':'aaa'},columns = {'three':'liu'})
data3
| one | two | liu | four |
aaa | 0 | 1 | 2 | 3 |
B | 4 | 5 | 6 | 7 |
C | 8 | 9 | 10 | 11 |
这里的异常值的阈值已知,因此,异常值的过滤或变换运算很大程度上其实就是逻辑数组运算。
data4 = pd.DataFrame(np.random.randn(1000,4))
data4.info()
data4.describe()
| 0 | 1 | 2 | 3 |
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.023986 | -0.014049 | 0.032299 | -0.037661 |
std | 0.994571 | 1.003522 | 1.009939 | 1.017361 |
min | -3.526970 | -3.298974 | -3.429383 | -3.421995 |
25% | -0.632426 | -0.685564 | -0.665548 | -0.756219 |
50% | 0.013326 | 0.006130 | -0.017911 | -0.015297 |
75% | 0.633279 | 0.670261 | 0.673849 | 0.665360 |
max | 3.549620 | 3.142503 | 3.991028 | 3.086376 |
data4[3][np.abs(data4[3]) > 3]
189 -3.421995
335 3.086376
590 -3.388477
778 -3.100379
Name: 3, dtype: float64
(np.abs(data4) > 3).any(1).head()
0 False
1 False
2 False
3 False
4 False
dtype: bool
data4[(np.abs(data4) > 3).any(1)]
| 0 | 1 | 2 | 3 |
109 | 3.549620 | -0.943976 | -0.058490 | 0.941503 |
189 | -0.071249 | -1.350361 | 0.385375 | -3.421995 |
291 | 2.337961 | 3.142503 | -0.208999 | -0.485979 |
335 | 0.230998 | -1.397259 | 2.734229 | 3.086376 |
447 | -3.526970 | -0.289467 | 1.099487 | 1.206039 |
464 | 0.011728 | -0.398739 | 3.104470 | 0.459924 |
546 | 0.357944 | 0.007063 | 3.991028 | 0.722481 |
573 | -3.019947 | -0.982651 | -1.727289 | 1.484966 |
590 | 0.211069 | 0.344059 | 0.656351 | -3.388477 |
660 | 0.930103 | 3.117643 | -1.372034 | -1.208730 |
663 | 0.362668 | -3.298974 | -1.033128 | 0.900985 |
778 | 0.094172 | 0.827937 | 2.617724 | -3.100379 |
814 | -1.450645 | -1.131513 | -3.429383 | -0.828139 |
853 | 1.188536 | -3.069987 | -0.746700 | 0.745037 |
899 | 2.449030 | 0.429959 | 3.025705 | -1.571179 |
data4[np.abs(data4) > 3] = np.sign(data) * 3
data4.isnull().sum()#有空值
0 3
1 4
2 4
3 4
dtype: int64
data4 = data4.replace(np.nan,0)
data4.isnull().sum()#无空值
0 0
1 0
2 0
3 0
dtype: int64
data4.describe()###?????????
| 0 | 1 | 2 | 3 |
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.026983 | -0.013941 | 0.025608 | -0.030836 |
std | 0.977152 | 0.983421 | 0.986831 | 0.996554 |
min | -2.749595 | -2.799638 | -2.943564 | -2.743207 |
25% | -0.630318 | -0.682237 | -0.663014 | -0.739291 |
50% | 0.012445 | 0.000613 | -0.017171 | -0.004484 |
75% | 0.631146 | 0.668023 | 0.660236 | 0.659204 |
max | 2.829804 | 2.915031 | 2.907655 | 2.679495 |
1)numpy.random.permutation函数
2)np.random.randint生成随机数
df = pd.DataFrame(np.arange(5 *4).reshape(5,4))
sampler = np.random.permutation(5)
df
| 0 | 1 | 2 | 3 |
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
sampler
array([0, 1, 3, 4, 2])
df.ix[sampler]
| 0 | 1 | 2 | 3 |
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
df.take(sampler)
| 0 | 1 | 2 | 3 |
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
通过np.random.randint得到一组随机整数
sampler1 = np.random.randint(0,len(df),size = 4)
sampler1
array([2, 2, 3, 0])
df1 = df.take(sampler1)
1
df1
| 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
0 | 0 | 1 | 2 | 3 |
7)计算指标/哑变量
将分类变量(categorical variable)转换为(哑变量矩阵,dummy matrix)或(指标矩阵,indicator matrix)是常用于统计学习建模或机器学习的转换方式。
即 DataFrame的某一列中含有k个不同的值,则可以派生出一个k列矩阵或DataFrame(其值为1或0)。
pandas中的get_dummies函数可以实现以上功能
df2 = pd.DataFrame({'key':['b','a','b','c','a','b'],'data1':range(6)})
df2
| data1 | key |
0 | 0 | b |
1 | 1 | a |
2 | 2 | b |
3 | 3 | c |
4 | 4 | a |
5 | 5 | b |
pd.get_dummies(df2.key)
| a | b | c |
0 | 0.0 | 1.0 | 0.0 |
1 | 1.0 | 0.0 | 0.0 |
2 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
pd.get_dummies(df2['key'],prefix = 'key')
| key_a | key_b | key_c |
0 | 0.0 | 1.0 | 0.0 |
1 | 1.0 | 0.0 | 0.0 |
2 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
## get_dummies矩阵和原数据连接
dummies = pd.get_dummies(df2['key'],prefix = 'key')
pd.concat([df2['data1'],dummies],axis = 1)
| data1 | key_a | key_b | key_c |
0 | 0 | 0.0 | 1.0 | 0.0 |
1 | 1 | 1.0 | 0.0 | 0.0 |
2 | 2 | 0.0 | 1.0 | 0.0 |
3 | 3 | 0.0 | 0.0 | 1.0 |
4 | 4 | 1.0 | 0.0 | 0.0 |
5 | 5 | 0.0 | 1.0 | 0.0 |
df2[['data1']].join(dummies)#Series没有join
| data1 | key_a | key_b | key_c |
0 | 0 | 0.0 | 1.0 | 0.0 |
1 | 1 | 1.0 | 0.0 | 0.0 |
2 | 2 | 0.0 | 1.0 | 0.0 |
3 | 3 | 0.0 | 0.0 | 1.0 |
4 | 4 | 1.0 | 0.0 | 0.0 |
5 | 5 | 0.0 | 1.0 | 0.0 |
df2[['data1']]#选出的是DataFrame
| data1 |
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
5 | 5 |
df2['data1']#选出的是Series
0 0
1 1
2 2
3 3
4 4
5 5
Name: data1, dtype: int64