python_缺失值处理

Data Cleaning and Preparation
# pandas使用浮点值NaN(Not a Number)表示缺失数据,
# 我们称其为哨兵值。
import numpy as np
import pandas as pd
# Remember the current row-display limit before overriding it
# (presumably so it can be restored later -- no restore is visible here).
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Show at most 20 rows when pandas prints an object.
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default figure size for any plots.
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)
Handling Missing Data
# Build a Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
# Check which entries are missing: isnull() gives a boolean Series that is
# True where the value is missing.
string_data.isnull()
0 False
1 False
2 True
3 False
dtype: bool
# Python's None is also reported as missing by isnull(): after this
# assignment position 0 is flagged as well.
string_data[0] = None
string_data.isnull()
0 True
1 False
2 True
3 False
dtype: bool
Filtering Out Missing Data
# 滤除缺失数据
# 过滤掉缺失数据的办法有很多种。你可以通过pandas.isnull或布尔索引
# 手工过滤,但dropna可能会更实用一些。对于一个Series,dropna返回
# 一个仅含非空数据和索引值的Series(即删除空值所在的行):
# NA is a short alias for NumPy's nan, used to mark missing entries below.
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.head()
# dropna() returns only the non-missing values, keeping their original
# index labels.
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
# Boolean indexing with notnull() selects the same non-missing entries
# as dropna().
data[data.notnull()]
0 1.0
2 3.5
4 7.0
dtype: float64
# For a DataFrame, dropna() with no arguments drops every row that contains
# at least one missing value.
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
cleaned
0 1 2
0 1.0 6.5 3.0
# Passing how='all' drops only the rows in which every value is NA.
data.dropna(how='all')
# Add a column (label 4) made up entirely of NA ...
data[4] = NA
data
# ... and drop columns that are all NA by passing axis=1 together with
# how='all'.
data.dropna(axis=1, how='all')
# Set part of the data to NA, then use the thresh argument to keep only
# rows that have enough non-missing values.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1
df.iloc[:2, 2] = NA  # first two rows of column 2
df
df.dropna()
# thresh=2 keeps the rows that have at least two non-NA values.
df.dropna(thresh=2)
0 1 2
2 1.634736 NaN 0.457940
3 0.555154 NaN -0.440554
4 -0.301350 0.498791 -0.823991
5 1.320566 0.507965 -0.653438
6 0.186980 -0.391725 -0.272293
Filling In Missing Data
# Filling in missing data: fillna(0) returns a new object in which every
# missing value is replaced by 0.
df.fillna(0)
0 1 2
0 0.107657 0.000000 0.000000
1 -0.017007 0.000000 0.000000
2 1.634736 0.000000 0.457940
3 0.555154 0.000000 -0.440554
4 -0.301350 0.498791 -0.823991
5 1.320566 0.507965 -0.653438
6 0.186980 -0.391725 -0.272293
# 通过一个字典调用fillna,就可以实现对不同的列填充不同的值:
# A dict argument maps column label -> fill value: here missing values in
# column 1 become 0.5 and those in column 2 become 0.
df.fillna({1: 0.5, 2: 0})
0 1 2
0 0.107657 0.500000 0.000000
1 -0.017007 0.500000 0.000000
2 1.634736 0.500000 0.457940
3 0.555154 0.500000 -0.440554
4 -0.301350 0.498791 -0.823991
5 1.320566 0.507965 -0.653438
6 0.186980 -0.391725 -0.272293
# Modify the existing object in place instead of returning a new one.
# With inplace=True the call's result is not needed; binding it to `_`
# just discards it.
_ = df.fillna(0, inplace=True)
df
0 1 2
0 0.107657 0.000000 0.000000
1 -0.017007 0.000000 0.000000
2 1.634736 0.000000 0.457940
3 0.555154 0.000000 -0.440554
4 -0.301350 0.498791 -0.823991
5 1.320566 0.507965 -0.653438
6 0.186980 -0.391725 -0.272293
# Build a frame whose missing values sit at the bottom of columns 1 and 2,
# so forward filling has earlier values to propagate.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA  # rows 2-5 of column 1
df.iloc[4:, 2] = NA  # rows 4-5 of column 2
df
# Forward fill: carry the last valid value in each column downwards.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill()
# limit=2 fills at most two consecutive missing values per gap.
df.ffill(limit=2)
data = pd.Series([1., NA, 3.5, NA, 7])
# Fill the gaps with a statistic of the Series itself, such as its mean
# (the median would work the same way).
data.fillna(data.mean())
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000