Python数据分析:Pandas常用函数用法
import pandas as pd
import numpy as np
# apply函数
# 模拟一份数据
df = pd.DataFrame({'name':['peter', 'mike', 'tom'],
'sex':[1, 0, 1],
'age':[20, 23, 28]})
df
name | sex | age | |
0 | peter | 1 | 20 |
1 | mike | 0 | 23 |
2 | tom | 1 | 28 |
# example1:将名字的首字母大写
df['name'] = df['name'].apply(lambda x: x.title())
df
name | sex | age | |
0 | Peter | 1 | 20 |
1 | Mike | 0 | 23 |
2 | Tom | 1 | 28 |
# example2:性别中0-female 1-male
def change_sex(x):
if x == 0:
return 'female'
else:
return 'male'
df['sex'] = df['sex'].apply(change_sex)
df
name | sex | age | |
0 | Peter | male | 20 |
1 | Mike | female | 23 |
2 | Tom | male | 28 |
# example3:年龄加5岁
df['age'] = df['age'].apply(lambda x: x+5)
df
name | sex | age | |
0 | Peter | male | 25 |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
# between_time函数
# 来自官网的案例
# 官网模拟数据
i = pd.date_range('2022-02-04', periods=5, freq='1D20min') # 1D代表一天
df2 = pd.DataFrame({'A':[1, 2, 3, 4, 5]}, index=i)
df2
A | |
2022-02-04 00:00:00 | 1 |
2022-02-05 00:20:00 | 2 |
2022-02-06 00:40:00 | 3 |
2022-02-07 01:00:00 | 4 |
2022-02-08 01:20:00 | 5 |
df2.between_time("0:45", "0:15")
df2
A | |
2022-02-04 00:00:00 | 1 |
2022-02-05 00:20:00 | 2 |
2022-02-06 00:40:00 | 3 |
2022-02-07 01:00:00 | 4 |
2022-02-08 01:20:00 | 5 |
# name中包含P
df['name'].str.contains('P', regex=False)
0 True
1 False
2 False
Name: name, dtype: bool
# 选择name中带有P的数据
df[df['name'].str.contains('P')]
name | sex | age | |
0 | Peter | male | 25 |
# drop_duplicates函数
"""
删除数据中的重复值,可以选择根据某个或者多个字段来删除,
在删除数据的时候,默认保留的是第一条重复的数据,我们可以通过参数keep来指定保留最后一条
"""
df # 原数据
name | sex | age | |
0 | Peter | male | 25 |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
# 删除sex中重复的数据
df.drop_duplicates('sex')
name | sex | age | |
0 | Peter | male | 25 |
1 | Mike | female | 28 |
# 删除重复数据,指定保留最后一条
df.drop_duplicates('sex', keep='last')
name | sex | age | |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
# expanding函数
"""
这是一个窗口函数,实现的是一种类似累计求和的功能
"""
# 模拟数据
df1 = pd.DataFrame({'age': [10, 11, 12, np.nan, 24]})
df1
age | |
0 | 10.0 |
1 | 11.0 |
2 | 12.0 |
3 | NaN |
4 | 24.0 |
# 分别的指定1-2-3不同的窗口数
df1.expanding(1).sum()
age | |
0 | 10.0 |
1 | 21.0 |
2 | 33.0 |
3 | 33.0 |
4 | 57.0 |
df1.expanding(2).sum()
age | |
0 | NaN |
1 | 21.0 |
2 | 33.0 |
3 | 33.0 |
4 | 57.0 |
df1.expanding(3).sum()
# 当窗口数大于前面的记录数,则累计和用NaN表示
age | |
0 | NaN |
1 | NaN |
2 | 33.0 |
3 | 33.0 |
4 | 57.0 |
# filter函数
# 用来进行数据的过滤操作
# items:表示包含的字段
# regex:表示使用正则
# 新数据
df2 = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), #np.array指定行数据
index = ['mouse', 'rabbit'],
columns = ['one', 'two', 'three'])
df2
one | two | three | |
mouse | 1 | 2 | 3 |
rabbit | 4 | 5 | 6 |
# 过滤选择列属性
df2.filter(items=['one', 'three'])
df2
one | two | three | |
mouse | 1 | 2 | 3 |
rabbit | 4 | 5 | 6 |
# 使用正则:列属性以o结尾的数据
df2.filter(regex='o$', axis=1) # axis=1 表示列
two | |
mouse | 2 |
rabbit | 5 |
# 选择行数据
df2.filter(like='bbi', axis=0) # axis=0 表示行
one | two | three | |
rabbit | 4 | 5 | 6 |
# ge 函数
"""
进行比较的一个函数:ge表示greater equal
"""
df
name | sex | age | |
0 | Peter | male | 25 |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
df['age'].ge(26)
0 False
1 True
2 True
Name: age, dtype: bool
df[df['age'].ge(26)]
name | sex | age | |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
# hist函数
# pandas内置的绘制直方图的函数
df4 = pd.DataFrame({
'length': [1.5, 0.5, 1.2, 0.9, 3],
'width': [0.7, 0.2, 0.15, 0.2,1.1]},
index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
hist = df4.hist(bins=3)
# iterrows函数
# iterrows函数用于对DataFrame进行迭代循环
df
for index, row in df.iterrows():
print(index) # 输出索引
print(row) # 输出每行的数据
print(type(row)) # row的类型:用Series表示
0
name Peter
sex male
age 25
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
1
name Mike
sex female
age 28
Name: 1, dtype: object
<class 'pandas.core.series.Series'>
2
name Tom
sex male
age 33
Name: 2, dtype: object
<class 'pandas.core.series.Series'>
# join函数
# join函数用于合并同的DataFrame
df5 = pd.DataFrame({
'key':['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']
})
df5
key | A | |
0 | K0 | A0 |
1 | K1 | A1 |
2 | K2 | A2 |
3 | K3 | A3 |
4 | K4 | A4 |
5 | K5 | A5 |
df6 = pd.DataFrame({
'key': ['K0', 'K1', 'K2'],
'B': ['B0', 'B1', 'B2']
})
df6
key | B | |
0 | K0 | B0 |
1 | K1 | B1 |
2 | K2 | B2 |
# 使用join合并两个DataFrame
df5.join(df6, lsuffix='_df5', rsuffix='_df6')
key_df5 | A | key_df6 | B | |
0 | K0 | A0 | K0 | B0 |
1 | K1 | A1 | K1 | B1 |
2 | K2 | A2 | K2 | B2 |
3 | K3 | A3 | NaN | NaN |
4 | K4 | A4 | NaN | NaN |
5 | K5 | A5 | NaN | NaN |
# kurtosis函数
"""
用于查找一组数据中的峰度值
"""
s = pd.Series([10, 20, 16, 14, 30, 28],
index=['A', 'B', 'C', 'D', 'E', 'F'])
s
A 10
B 20
C 16
D 14
E 30
F 28
dtype: int64
# 生成数据的峰度值
s.kurtosis()
-1.6446236294803107
# 如果给定的数据中存在缺失值,可以使用参数skipna直接跳过
s1 = pd.Series([10, None, 16, 14, 30, None])
s1
0 10.0
1 NaN
2 16.0
3 14.0
4 30.0
5 NaN
dtype: float64
s1.kurtosis(skipna=True)
2.646199227619398
# last函数
# 这是一个用在基于时间数据选择上的函数
i = pd.date_range('2018-04-09', # 起始日期
periods=6, # 周期
freq='2D') # 频率,间隔
i
DatetimeIndex(['2018-04-09', '2018-04-11', '2018-04-13', '2018-04-15',
'2018-04-17', '2018-04-19'],
dtype='datetime64[ns]', freq='2D')
# max/min/mean/median4个基于统计概念的函数:最大值、最小值、平均值、中位数
df.max()
name Tom
sex male
age 33
dtype: object
df.min() # 最小值
name Mike
sex female
age 25
dtype: object
df.median() # 中位数
/var/folders/lt/ffnv2_hs11714chfdc9jpjsm0000gn/T/ipykernel_43454/252926612.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
df.median() # 中位数
age 28.0
dtype: float64
df.mean() #均值
/var/folders/lt/ffnv2_hs11714chfdc9jpjsm0000gn/T/ipykernel_43454/359985242.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
df.mean() #均值
age 28.666667
dtype: float64
# nlarge函数
df7 = pd.DataFrame({
'population': [59000000, 65000000, 434000,434000,
434000, 337000, 11300,11300, 11300],
'GDP': [1937894, 2583560 , 12011, 4520,
12128,17036, 182, 38, 311],
'alpha-2': ["IT", "FR", "MT", "MV", "BN",
"IS", "NR", "TV", "AI"]},
index=["Italy", "France", "Malta",
"Maldives", "Brunei", "Iceland",
"Nauru", "Tuvalu", "Anguilla"])
# 记录每个国家的人口数、GDP和名称2位大写
df7
population | GDP | alpha-2 | |
Italy | 59000000 | 1937894 | IT |
France | 65000000 | 2583560 | FR |
Malta | 434000 | 12011 | MT |
Maldives | 434000 | 4520 | MV |
Brunei | 434000 | 12128 | BN |
Iceland | 337000 | 17036 | IS |
Nauru | 11300 | 182 | NR |
Tuvalu | 11300 | 38 | TV |
Anguilla | 11300 | 311 | AI |
# 选择人口前三的信息,默认情况
df7.nlargest(3, 'population')
population | GDP | alpha-2 | |
France | 65000000 | 2583560 | FR |
Italy | 59000000 | 1937894 | IT |
Malta | 434000 | 12011 | MT |
# 2, keep='last'
df7.nlargest(3, 'population', keep='last')
population | GDP | alpha-2 | |
France | 65000000 | 2583560 | FR |
Italy | 59000000 | 1937894 | IT |
Brunei | 434000 | 12128 | BN |
# 3 keep = 'all'
df7.nlargest(3, 'population', keep='all')
population | GDP | alpha-2 | |
France | 65000000 | 2583560 | FR |
Italy | 59000000 | 1937894 | IT |
Malta | 434000 | 12011 | MT |
Maldives | 434000 | 4520 | MV |
Brunei | 434000 | 12128 | BN |
# pop 函数
# 删除前
df
name | sex | age | |
0 | Peter | male | 25 |
1 | Mike | female | 28 |
2 | Tom | male | 33 |
# 执行删除操作
df.pop('sex')
0 male
1 female
2 male
Name: sex, dtype: object
# 删除后
df
name | age | |
0 | Peter | 25 |
1 | Mike | 28 |
2 | Tom | 33 |
# quantile函数
# quantile就是分位数的意思,函数的语法规则:
DataFrame.quantile(
q=0.5,
axis=0,
numeric_only=True,
interpolation=’linear’)
"""
q: 数字或者是类列表,范围只能在0-1之间,默认是0.5,即中位数-第2四分位数
axis :计算方向,0-index, 1-columns,默认为 0
numeric_only:只允许是数值型数据
interpolation(插值方法):可以是 {‘linear’, ‘lower’, ‘higher’, ‘midpoint’, ‘nearest’}之一,默认是linear。
"""
df8 = pd.DataFrame({'A':[2, 3, 7, 8, 9, 10]})
df8
A | |
0 | 2 |
1 | 3 |
2 | 7 |
3 | 8 |
4 | 9 |
5 | 10 |
# 默认是liner
df8['A'].quantile()
7.5
# 默认为liner
df8['A'].quantile(interpolation='higher')
8
# reset_index函数
# reset就是重置的含义,index就是行索引;连起来就是重置行索
df9 = pd.DataFrame({"fruit":["苹果","香蕉","橙子","橙子","苹果","橙子"],
"amount":[100,200,130,150,88,40]})
df9
fruit | amount | |
0 | 苹果 | 100 |
1 | 香蕉 | 200 |
2 | 橙子 | 130 |
3 | 橙子 | 150 |
4 | 苹果 | 88 |
5 | 橙子 | 40 |
# 统计每种水果的总销售额
# 不加reset_index
df9.groupby('fruit')['amount'].sum() # Series数据
fruit
橙子 320
苹果 188
香蕉 200
Name: amount, dtype: int64
# 加上
df9.groupby('fruit')['amount'].sum().reset_index() # DataFrame数据
fruit | amount | |
0 | 橙子 | 320 |
1 | 苹果 | 188 |
2 | 香蕉 | 200 |
#select_dtypes函数
"""
根据字段类型来筛选数据,可以包含或者排除一个或者多个字段类型的数据。
下面是官网的案例,稍作修改:生成了3个不同数据类型的字段
"""
# 来自官网的案例,稍微修改
df10 = pd.DataFrame({'a':[1, 2] * 3,
'b': [True, False] *3,
'c': [1.0, 2.0] * 3},
index=[0, 1, 4, 5, 2, 3] # 添加内容
)
df10
a | b | c | |
0 | 1 | True | 1.0 |
1 | 2 | False | 2.0 |
4 | 1 | True | 1.0 |
5 | 2 | False | 2.0 |
2 | 1 | True | 1.0 |
3 | 2 | False | 2.0 |
df10.dtypes # 查看数据类型
a int64
b bool
c float64
dtype: object
# 选择数据类型
df10.select_dtypes(include='bool')
b | |
0 | True |
1 | False |
4 | True |
5 | False |
2 | True |
3 | False |
# 包含多个类型
df10.select_dtypes(include=['bool', 'int64'])
a | b | |
0 | 1 | True |
1 | 2 | False |
4 | 1 | True |
5 | 2 | False |
2 | 1 | True |
3 | 2 | False |
# 排除某个类型
# 不包含bool型
df10.select_dtypes(exclude='bool')
a | c | |
0 | 1 | 1.0 |
1 | 2 | 2.0 |
4 | 1 | 1.0 |
5 | 2 | 2.0 |
2 | 1 | 1.0 |
3 | 2 | 2.0 |
# 排除多个类型
df10.select_dtypes(exclude=['bool', 'float64'])
a | |
0 | 1 |
1 | 2 |
4 | 1 |
5 | 2 |
2 | 1 |
3 | 2 |
# take函数
"""
也是选择数据的一个函数,具体语法为:
take(indices, axis=0, is_copy=None, **kwargs)
indices:选择位置:数组或者切片
axis:选择的轴,0-index,1-column,默认是0
is_copy:是否返回副本;从Pandas1.0开始
"""
df10
a | b | c | |
0 | 1 | True | 1.0 |
1 | 2 | False | 2.0 |
4 | 1 | True | 1.0 |
5 | 2 | False | 2.0 |
2 | 1 | True | 1.0 |
3 | 2 | False | 2.0 |
df10.take([0, 4])
a | b | c | |
0 | 1 | True | 1.0 |
2 | 1 | True | 1.0 |
df10.take([0, 2, 5])
a | b | c | |
0 | 1 | True | 1.0 |
4 | 1 | True | 1.0 |
3 | 2 | False | 2.0 |
df10.take([-1, -3])
a | b | c | |
3 | 2 | False | 2.0 |
5 | 2 | False | 2.0 |
df10.take([1, 2], axis=1)
b | c | |
0 | True | 1.0 |
1 | False | 2.0 |
4 | True | 1.0 |
5 | False | 2.0 |
2 | True | 1.0 |
3 | False | 2.0 |
# update函数
# 更新某个DataFrame数据框;模拟两个数据:
df11 = pd.DataFrame({'A': [1, 2, 3],
'B': [400, 500, 600]
})
df11
A | B | |
0 | 1 | 400 |
1 | 2 | 500 |
2 | 3 | 600 |
df12 = pd.DataFrame({'B': [4, 5, 6],
'C': [7, 8, 9]
})
df12
B | C | |
0 | 4 | 7 |
1 | 5 | 8 |
2 | 6 | 9 |
df11.update(df12)
df11
A | B | |
0 | 1 | 4 |
1 | 2 | 5 |
2 | 3 | 6 |
# 如果用于更新的数据存在空值,则保持原来的数据不变
df13 = pd.DataFrame(
{'B':[44, None, 66], # 添加空值
'C':[ 7, 8, 9]})
df13
B | C | |
0 | 44.0 | 7 |
1 | NaN | 8 |
2 | 66.0 | 9 |
df11.update(df13)
df11
A | B | |
0 | 1 | 44.0 |
1 | 2 | 5.0 |
2 | 3 | 66.0 |
df11.var()
A 1.000000
B 954.333333
dtype: float64
df11.var(ddof=0)
A 0.666667
B 636.222222
dtype: float64
df11.var(ddof=1)
A 1.000000
B 954.333333
dtype: float64
# where函数
# 用于查找满足条件的数据
w = pd.Series(range(5))
w
0 0
1 1
2 2
3 3
4 4
dtype: int64
# 满足条件的显示;不满足的用空值代替
w.where(w>=2)
0 NaN
1 NaN
2 2.0
3 3.0
4 4.0
dtype: float64
# 不满足的用8替代
w.where(w>=2, 8)
0 8
1 8
2 2
3 3
4 4
dtype: int64
备注:以上案例数据来自官网数据