Pandas常用函数用法

原创

bruce_xiaowei 2024-04-16 10:56:47 博主文章分类：笔记 ©著作权

文章标签 python 数据结构 pandas 数据 f5 文章分类 Html/CSS 前端开发

©著作权归作者所有：来自51CTO博客作者bruce_xiaowei的原创作品，请联系作者获取转载授权，否则将追究法律责任

Python数据分析：Pandas常用函数用法

import pandas as pd 
import numpy as np 

# apply函数
# 模拟一份数据

df = pd.DataFrame({'name':['peter', 'mike', 'tom'],
                  'sex':[1, 0, 1], 
                  'age':[20, 23, 28]})
df

	name	sex	age
0	peter	1	20
1	mike	0	23
2	tom	1	28

# example1:将名字的首字母大写
df['name'] = df['name'].apply(lambda x: x.title())
df

	name	sex	age
0	Peter	1	20
1	Mike	0	23
2	Tom	1	28

# example2:性别中0-female 1-male
def change_sex(x):
    if x == 0:
        return 'female'
    else:
        return 'male'
    
df['sex'] = df['sex'].apply(change_sex)
df

	name	sex	age
0	Peter	male	20
1	Mike	female	23
2	Tom	male	28

# example3:年龄加5岁
df['age'] = df['age'].apply(lambda x: x+5)
df

	name	sex	age
0	Peter	male	25
1	Mike	female	28
2	Tom	male	33

# between_time函数
# 来自官网的案例
# 官网模拟数据
i = pd.date_range('2022-02-04', periods=5, freq='1D20min') # 1D代表一天
df2 = pd.DataFrame({'A':[1, 2, 3, 4, 5]}, index=i)
df2

	A
2022-02-04 00:00:00	1
2022-02-05 00:20:00	2
2022-02-06 00:40:00	3
2022-02-07 01:00:00	4
2022-02-08 01:20:00	5

df2.between_time("0:45", "0:15")
df2

	A
2022-02-04 00:00:00	1
2022-02-05 00:20:00	2
2022-02-06 00:40:00	3
2022-02-07 01:00:00	4
2022-02-08 01:20:00	5

# name中包含P
df['name'].str.contains('P', regex=False)

0     True
1    False
2    False
Name: name, dtype: bool

# 选择name中带有P的数据
df[df['name'].str.contains('P')]

	name	sex	age
0	Peter	male	25

# drop_duplicates函数
"""
删除数据中的重复值，可以选择根据某个或者多个字段来删除，
在删除数据的时候，默认保留的是第一条重复的数据，我们可以通过参数keep来指定保留最后一条
"""
df # 原数据

	name	sex	age
0	Peter	male	25
1	Mike	female	28
2	Tom	male	33

# 删除sex中重复的数据
df.drop_duplicates('sex')

	name	sex	age
0	Peter	male	25
1	Mike	female	28

# 删除重复数据，指定保留最后一条
df.drop_duplicates('sex', keep='last')

	name	sex	age
1	Mike	female	28
2	Tom	male	33

# expanding函数
"""
这是一个窗口函数，实现的是一种类似累计求和的功能
"""
# 模拟数据
df1 = pd.DataFrame({'age': [10, 11, 12, np.nan, 24]})
df1

	age
0	10.0
1	11.0
2	12.0
3	NaN
4	24.0

# 分别的指定1-2-3不同的窗口数
df1.expanding(1).sum()

	age
0	10.0
1	21.0
2	33.0
3	33.0
4	57.0

df1.expanding(2).sum()

	age
0	NaN
1	21.0
2	33.0
3	33.0
4	57.0

df1.expanding(3).sum()
# 当窗口数大于前面的记录数，则累计和用NaN表示

	age
0	NaN
1	NaN
2	33.0
3	33.0
4	57.0

# filter函数
# 用来进行数据的过滤操作
# items:表示包含的字段
# regex：表示使用正则
# 新数据
df2 = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), #np.array指定行数据
                  index = ['mouse', 'rabbit'],
                  columns = ['one', 'two', 'three'])
df2

	one	two	three
mouse	1	2	3
rabbit	4	5	6

# 过滤选择列属性
df2.filter(items=['one', 'three'])
df2

	one	two	three
mouse	1	2	3
rabbit	4	5	6

# 使用正则：列属性以o结尾的数据
df2.filter(regex='o$', axis=1)  # axis=1 表示列

	two
mouse	2
rabbit	5

# 选择行数据
df2.filter(like='bbi', axis=0)  # axis=0 表示行

	one	two	three
rabbit	4	5	6

# ge 函数
"""
进行比较的一个函数：ge表示greater equal
"""
df

	name	sex	age
0	Peter	male	25
1	Mike	female	28
2	Tom	male	33

df['age'].ge(26)

0    False
1     True
2     True
Name: age, dtype: bool

df[df['age'].ge(26)]

	name	sex	age
1	Mike	female	28
2	Tom	male	33

# hist函数
# pandas内置的绘制直方图的函数
df4 = pd.DataFrame({
    'length': [1.5, 0.5, 1.2, 0.9, 3],
    'width': [0.7, 0.2, 0.15, 0.2,1.1]},
    index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
hist = df4.hist(bins=3)

Pandas常用函数用法_数据

# iterrows函数
# iterrows函数用于对DataFrame进行迭代循环
df
for index, row in df.iterrows():
    print(index)  # 输出索引
    print(row)  # 输出每行的数据
    print(type(row))  # row的类型：用Series表示

0
name    Peter
sex      male
age        25
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
1
name      Mike
sex     female
age         28
Name: 1, dtype: object
<class 'pandas.core.series.Series'>
2
name     Tom
sex     male
age       33
Name: 2, dtype: object
<class 'pandas.core.series.Series'>

# join函数
# join函数用于合并同的DataFrame
df5 = pd.DataFrame({
    'key':['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']
})
df5

	key	A
0	K0	A0
1	K1	A1
2	K2	A2
3	K3	A3
4	K4	A4
5	K5	A5

df6 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'B': ['B0', 'B1', 'B2']
})
df6

	key	B
0	K0	B0
1	K1	B1
2	K2	B2

# 使用join合并两个DataFrame
df5.join(df6, lsuffix='_df5', rsuffix='_df6')

	key_df5	A	key_df6	B
0	K0	A0	K0	B0
1	K1	A1	K1	B1
2	K2	A2	K2	B2
3	K3	A3	NaN	NaN
4	K4	A4	NaN	NaN
5	K5	A5	NaN	NaN

# kurtosis函数
"""
用于查找一组数据中的峰度值
"""
s = pd.Series([10, 20, 16, 14, 30, 28],
             index=['A', 'B', 'C', 'D', 'E', 'F'])
s

A    10
B    20
C    16
D    14
E    30
F    28
dtype: int64

# 生成数据的峰度值
s.kurtosis()

-1.6446236294803107

# 如果给定的数据中存在缺失值，可以使用参数skipna直接跳过
s1 = pd.Series([10, None, 16, 14, 30, None])
s1

0    10.0
1     NaN
2    16.0
3    14.0
4    30.0
5     NaN
dtype: float64

s1.kurtosis(skipna=True)

2.646199227619398

# last函数
# 这是一个用在基于时间数据选择上的函数
i = pd.date_range('2018-04-09', # 起始日期
                 periods=6,  # 周期
                 freq='2D') # 频率，间隔

i

DatetimeIndex(['2018-04-09', '2018-04-11', '2018-04-13', '2018-04-15',
               '2018-04-17', '2018-04-19'],
              dtype='datetime64[ns]', freq='2D')

# max/min/mean/median4个基于统计概念的函数:最大值、最小值、平均值、中位数
df.max()

name     Tom
sex     male
age       33
dtype: object

df.min()  # 最小值

name      Mike
sex     female
age         25
dtype: object

df.median() # 中位数

/var/folders/lt/ffnv2_hs11714chfdc9jpjsm0000gn/T/ipykernel_43454/252926612.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.median() # 中位数





age    28.0
dtype: float64

df.mean() #均值

/var/folders/lt/ffnv2_hs11714chfdc9jpjsm0000gn/T/ipykernel_43454/359985242.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.mean() #均值





age    28.666667
dtype: float64

# nlarge函数
df7 = pd.DataFrame({
  'population': [59000000, 65000000, 434000,434000, 
                 434000, 337000, 11300,11300, 11300],
  'GDP': [1937894, 2583560 , 12011, 4520, 
          12128,17036, 182, 38, 311],
  'alpha-2': ["IT", "FR", "MT", "MV", "BN",
              "IS", "NR", "TV", "AI"]},
  index=["Italy", "France", "Malta",
         "Maldives", "Brunei", "Iceland",
         "Nauru", "Tuvalu", "Anguilla"])
# 记录每个国家的人口数、GDP和名称2位大写
df7

	population	GDP	alpha-2
Italy	59000000	1937894	IT
France	65000000	2583560	FR
Malta	434000	12011	MT
Maldives	434000	4520	MV
Brunei	434000	12128	BN
Iceland	337000	17036	IS
Nauru	11300	182	NR
Tuvalu	11300	38	TV
Anguilla	11300	311	AI

# 选择人口前三的信息，默认情况
df7.nlargest(3, 'population')

	population	GDP	alpha-2
France	65000000	2583560	FR
Italy	59000000	1937894	IT
Malta	434000	12011	MT

# 2, keep='last'
df7.nlargest(3, 'population', keep='last')

	population	GDP	alpha-2
France	65000000	2583560	FR
Italy	59000000	1937894	IT
Brunei	434000	12128	BN

# 3 keep = 'all'
df7.nlargest(3, 'population', keep='all')

	population	GDP	alpha-2
France	65000000	2583560	FR
Italy	59000000	1937894	IT
Malta	434000	12011	MT
Maldives	434000	4520	MV
Brunei	434000	12128	BN

# pop 函数
# 删除前
df

	name	sex	age
0	Peter	male	25
1	Mike	female	28
2	Tom	male	33

# 执行删除操作
df.pop('sex')

0      male
1    female
2      male
Name: sex, dtype: object

# 删除后
df

	name	age
0	Peter	25
1	Mike	28
2	Tom	33

# quantile函数
# quantile就是分位数的意思，函数的语法规则：
DataFrame.quantile(
    q=0.5,  
    axis=0, 
    numeric_only=True,
    interpolation=’linear’)
"""
q: 数字或者是类列表，范围只能在0-1之间，默认是0.5，即中位数-第2四分位数
axis :计算方向，0-index, 1-columns，默认为 0
numeric_only：只允许是数值型数据
interpolation（插值方法）：可以是 {‘linear’, ‘lower’, ‘higher’, ‘midpoint’, ‘nearest’}之一，默认是linear。
"""

df8 = pd.DataFrame({'A':[2, 3, 7, 8, 9, 10]})
df8

	A
0	2
1	3
2	7
3	8
4	9
5	10

# 默认是liner
df8['A'].quantile()

7.5

# 默认为liner
df8['A'].quantile(interpolation='higher')

# reset_index函数

# reset就是重置的含义，index就是行索引；连起来就是重置行索

df9 = pd.DataFrame({"fruit":["苹果","香蕉","橙子","橙子","苹果","橙子"],
                   "amount":[100,200,130,150,88,40]})
df9

	fruit	amount
0	苹果	100
1	香蕉	200
2	橙子	130
3	橙子	150
4	苹果	88
5	橙子	40

# 统计每种水果的总销售额
# 不加reset_index
df9.groupby('fruit')['amount'].sum()  # Series数据

fruit
橙子    320
苹果    188
香蕉    200
Name: amount, dtype: int64

# 加上
df9.groupby('fruit')['amount'].sum().reset_index() # DataFrame数据

	fruit	amount
0	橙子	320
1	苹果	188
2	香蕉	200

#select_dtypes函数
"""
根据字段类型来筛选数据，可以包含或者排除一个或者多个字段类型的数据。
下面是官网的案例，稍作修改：生成了3个不同数据类型的字段
"""
# 来自官网的案例，稍微修改
df10 = pd.DataFrame({'a':[1, 2] * 3,
                    'b': [True, False] *3,
                     'c': [1.0, 2.0] * 3},
                    index=[0, 1, 4, 5, 2, 3] # 添加内容
                   )
df10

	a	b	c
0	1	True	1.0
1	2	False	2.0
4	1	True	1.0
5	2	False	2.0
2	1	True	1.0
3	2	False	2.0

df10.dtypes # 查看数据类型

a      int64
b       bool
c    float64
dtype: object

# 选择数据类型
df10.select_dtypes(include='bool')

	b
0	True
1	False
4	True
5	False
2	True
3	False

# 包含多个类型
df10.select_dtypes(include=['bool', 'int64'])

	a	b
0	1	True
1	2	False
4	1	True
5	2	False
2	1	True
3	2	False

# 排除某个类型
# 不包含bool型
df10.select_dtypes(exclude='bool')

	a	c
0	1	1.0
1	2	2.0
4	1	1.0
5	2	2.0
2	1	1.0
3	2	2.0

# 排除多个类型
df10.select_dtypes(exclude=['bool', 'float64'])

	a
0	1
1	2
4	1
5	2
2	1
3	2

# take函数
"""
也是选择数据的一个函数，具体语法为：
take(indices, axis=0, is_copy=None, **kwargs)
indices：选择位置：数组或者切片
axis：选择的轴，0-index，1-column，默认是0
is_copy：是否返回副本；从Pandas1.0开始
"""
df10

	a	b	c
0	1	True	1.0
1	2	False	2.0
4	1	True	1.0
5	2	False	2.0
2	1	True	1.0
3	2	False	2.0

df10.take([0, 4])

	a	b	c
0	1	True	1.0
2	1	True	1.0

df10.take([0, 2, 5])

	a	b	c
0	1	True	1.0
4	1	True	1.0
3	2	False	2.0

df10.take([-1, -3])

	a	b	c
3	2	False	2.0
5	2	False	2.0

df10.take([1, 2], axis=1)

	b	c
0	True	1.0
1	False	2.0
4	True	1.0
5	False	2.0
2	True	1.0
3	False	2.0

# update函数

# 更新某个DataFrame数据框；模拟两个数据：
df11 = pd.DataFrame({'A': [1, 2, 3],
                    'B': [400, 500, 600]
                    })
df11

	A	B
0	1	400
1	2	500
2	3	600

df12 = pd.DataFrame({'B': [4, 5, 6],
                    'C': [7, 8, 9]
                    })
df12

	B	C
0	4	7
1	5	8
2	6	9

df11.update(df12)
df11

	A	B
0	1	4
1	2	5
2	3	6

# 如果用于更新的数据存在空值，则保持原来的数据不变
df13 = pd.DataFrame(
{'B':[44, None, 66], # 添加空值
'C':[ 7, 8, 9]})
df13

	B	C
0	44.0	7
1	NaN	8
2	66.0	9

df11.update(df13)
df11

	A	B
0	1	44.0
1	2	5.0
2	3	66.0

df11.var()

A      1.000000
B    954.333333
dtype: float64

df11.var(ddof=0)

A      0.666667
B    636.222222
dtype: float64

df11.var(ddof=1)

A      1.000000
B    954.333333
dtype: float64

# where函数

# 用于查找满足条件的数据
w = pd.Series(range(5))
w

0    0
1    1
2    2
3    3
4    4
dtype: int64

# 满足条件的显示；不满足的用空值代替
w.where(w>=2)

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64

# 不满足的用8替代
w.where(w>=2, 8)

0    8
1    8
2    2
3    3
4    4
dtype: int64

备注：以上案例数据来自官网数据

上一篇：lambda函数简介

下一篇：Python编程：编写被动信息搜集之网址的IP及Whois查询

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯