# 文章目录

# DataFrame基本列操作

import pandas as pd
import numpy as np

# Build the sample data: a string column with one missing value ("a"),
# two float columns ("b", "c") and an all-zero integer column ("d").
l = [["p", 12, 12, 0], [np.nan, 12.3, 33., 0], ["q", 12.3, 0, 0], ["r", 1, 1, 0]]
df = pd.DataFrame(l, columns=["a", "b", "c", "d"])

print(df)
#      a     b     c  d
# 0    p  12.0  12.0  0
# 1  NaN  12.3  33.0  0
# 2    q  12.3   0.0  0
# 3    r   1.0   1.0  0


# A purely numeric column can be used in arithmetic directly; the
# operation is applied to every row at once.
df["c"] = df["c"] * 8
print(df)
#      a     b      c  d
# 0    p  12.0   96.0  0
# 1  NaN  12.3  264.0  0
# 2    q  12.3    0.0  0
# 3    r   1.0    8.0  0

# Two columns can be combined row by row as well (mind the dtypes).
df["b*c"] = df["b"] * df["c"]
print(df)
#      a     b      c  d     b*c
# 0    p  12.0   96.0  0  1152.0
# 1  NaN  12.3  264.0  0  3247.2
# 2    q  12.3    0.0  0     0.0
# 3    r   1.0    8.0  0     8.0

# A column holding strings supports the analogous operation: multiplying by
# an integer repeats each string, and the missing value stays NaN.
df["a*8"] = df["a"] * 8
print(df)
#      a     b      c  d     b*c       a*8
# 0    p  12.0   96.0  0  1152.0  pppppppp
# 1  NaN  12.3  264.0  0  3247.2       NaN
# 2    q  12.3    0.0  0     0.0  qqqqqqqq
# 3    r   1.0    8.0  0     8.0  rrrrrrrr

# Shift the values of a column down or up. The result of each call below is
# printed explicitly: a bare expression statement is evaluated and discarded
# when the file runs as a script, so nothing would be shown otherwise.
print(df["b"].shift(1))  # move every value down one row (periods defaults to 1)
# 0     NaN
# 1    12.0
# 2    12.3
# 3    12.3
# Name: b, dtype: float64
print(df["b"].shift(-1))  # move every value up one row
# 0    12.3
# 1    12.3
# 2     1.0
# 3     NaN
# Name: b, dtype: float64

# Row-to-row difference: each value minus the value one row above it
# (as with shift(), a negative argument looks in the other direction).
print(df["b"].diff(1))
# 0      NaN
# 1      0.3
# 2      0.0
# 3    -11.3
# Name: b, dtype: float64

# Fractional change relative to the previous row (rise/fall ratio).
print(df["b"].pct_change())
# 0          NaN
# 1     0.025000
# 2     0.000000
# 3    -0.918699
# Name: b, dtype: float64


# Ranking: position of each value within the column, smallest value first.
print(df["c"].rank())
# 0    3.0
# 1    4.0
# 2    1.0
# 3    2.0
# Name: c, dtype: float64

# DataFrame常用列处理

# Rebuild the sample data used in the previous section. The earlier examples
# overwrite column "c" and append extra columns to df, whereas the outputs
# documented below are those of the untouched data set.
df = pd.DataFrame(
    [["p", 12, 12, 0], [np.nan, 12.3, 33., 0], ["q", 12.3, 0, 0], ["r", 1, 1, 0]],
    columns=["a", "b", "c", "d"],
)

# Drop the columns that consist only of zeros: a column is kept as soon as
# it holds at least one entry that is not equal to 0.
df1 = df.loc[:, (df != 0).any()]
# Alternative: df1 = df.drop(columns=df.columns[(df == 0).all()])
print(df1)
#      a     b     c
# 0    p  12.0  12.0
# 1  NaN  12.3  33.0
# 2    q  12.3   0.0
# 3    r   1.0   1.0

# Replace the zeros in column "c" with the mean of its non-zero values.
# Step 1: turn the zeros into NaN so that mean() leaves them out.
df["c"] = df["c"].replace(0, np.nan)
# Step 2: fill the NaN with the column mean. The result is assigned back
# rather than calling fillna(inplace=True) on df.loc[:, "c"]: that call works
# on a temporary Series and does not update df under pandas Copy-on-Write.
df["c"] = df["c"].fillna(df["c"].mean())

print(df)
#      a     b          c  d
# 0    p  12.0  12.000000  0
# 1  NaN  12.3  33.000000  0
# 2    q  12.3  15.333333  0
# 3    r   1.0   1.000000  0