python pandas 的高级操作
- 1、groupby 与 apply的结合使用
- 1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
- 2) 多列list元素 一行变多行
- 3) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
- 2、groupby 与 pivot_table的区别与联系
- 3、每列缺失值情况
- 4、时间列读取、转换、提取
- 5、筛选数据、删除重复数据、删除列
- 6、各列分布情况
- 4、堆叠柱状图及百分比堆叠柱状图
1、groupby 与 apply的结合使用
- groupby:主要用于分组聚合,可结合统计函数(mean()、min()、sum()、count()、、、)一起使用,跟聚合函数agg()类 似;
- apply:可以利用函数包括自定义函数迭代计算,可以结合lambda使用
- 详情参考 https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
df = pd.DataFrame(data=[[1,2,'book1'], [1, 2, 'book2'], [4, 5, 'book3']], columns=['ID', 'Day', 'title'])
print(df)
# 多行变一行,合成list
df.groupby(['ID', 'Day'])['title'].apply(list)
# 自定义函数
def fun(row):
tag = eval(row['tag'])
return tag
df['tag'] = df.apply(fun, axis=1)
2) 多列list元素 一行变多行
# 使用eval函数解析list元素
df = pd.read_csv('/mnt/mxm/home/project/gitTest/buyer_info_tag.csv', converters={'label_buyer': eval})
# list 拆分为多行
# 方法一
import numpy as np
new_values = np.dstack((np.repeat(buyer_info.buyer_id.values, list(map(len, buyer_info.month.values))),
np.concatenate(buyer_info.month.values),
np.concatenate(buyer_info.tag.values)))
buyer = pd.DataFrame(data=new_values[0], columns=['buyer_id', 'month', 'tag'])
# 方法二
# 使用explode函数
buyer = buyer_info.explode(['month', 'tag'])
3) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
test = pd.DataFrame(data=[['1',['物理', '历史', '数学']], ['2', ['历史', '物理', '数学']], ['3', ['历史', '语文', '数学']]], columns=['id', 'subject'])
print(test)
test['subject'].apply(lambda str : str.sort(), str)
print(test)
#test['km'] = test['km'].apply(lambda str : tuple(str))
test['subject_str'] = test['subject'].apply(lambda str : str[0] + '_' + str[1] + '_' + str[2])
print(test)
mapping = {'历史_语文_数学': 'a', '历史_数学_物理':'b', '历史_数学_语文': 'c'}
test['label'] = test['subject_str'].replace(mapping)
print(test)
2、groupby 与 pivot_table的区别与联系
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo","bar", "bar", "bar", "bar"],"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],"C": ["small", "large", "large", "small","small", "large", "small", "small","large"],"D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
print(df)
print('-----------------------')
print('------pivot_table效果---数值sum--')
print(pd.pivot_table(df, values='D', index=['A'],columns=['C'], aggfunc=np.sum))
print('----行数len--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=len))
print('----字符串sum--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=np.sum))
print('-------groupby效果--------')
print(df.groupby(by=['A', 'C'])['D'].sum())
print("------groupby 展平效果-----")
temp = df.groupby(by=['A', 'C'])['D'].sum()
temp = temp.reset_index(level = ['A','C']).rename(columns={'D':'D_SUM'})
print(temp)
print('-----------------------')
print('-----pivot_table效果--数值sum----')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum))
print('-----pivot_table效果并填充空值----数值sum---')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum, fill_value=0))
print('------groupby效果-----')
print(df.groupby(by=['A', 'B', 'C'])['D'].sum())
# 选取科班生
undergraduate_2018 = student_2018[student_2018['PID'].isin([1])].groupby(by=['FROMCARDNO', 'YEAR_MONTH'])['JE'].sum()
# 展平分组
undergraduate_2018 = undergraduate_2018.unstack()
print(undergraduate_2018)
undergraduate_2018.describe()
3、每列缺失值情况
# 每列缺失数量
missing = pd.DataFrame(ad_info.isnull().sum()).rename(columns = {0: 'total'})
# 每列数据缺失比例
missing['percent'] = missing['total'] / len(ad_info)
# 数据缺失比例
missing.sort_values('percent', ascending = False)
4、时间列读取、转换、提取
# 读取数据时,进行时间解析
dateparse = lambda dates: pd.datetime.strptime(dates, '%d/%m/%Y %H:%M:%S')
df = pd.read_csv('/home/mxm/project/gittest/tm_foodied_20216_2022.csv', parse_dates=['order_pay_time'], date_parser=dateparse)
# 列类型object转换时间类型
train['operTime'] = pd.to_datetime(train['operTime'], infer_datetime_format=True)
# 按时间格式,提取年、月、日信息
train['operDate'] = train['operTime'].dt.strftime('%Y-%m-%d')
train['operYear'] = train['operTime'].dt.strftime('%Y')
5、筛选数据、删除重复数据、删除列
# 'PID':47
staff_2018 = df_2018[df_2018['PID'].isin([47])]
staff_2018 = df_2018[df_2018['JE'] >= 0]
staff_2018 = df_2018[(df_2018['JE'] >= 0) & (df_2018['JE'] <= 50)]
# 删除重复数据
staff_2018.drop_duplicates(inplace=True)
print('重复数据的数量:', staff_2018.duplicated().sum())
# 删除列
staff_2018.drop(columns=['spreadAppId'], inplace=True)
6、各列分布情况
for i, col in enumerate(user_info[['age', 'gender', 'city', 'province', 'phoneType','carrier']]):
# 设置子图排列形式,如3行2列
ax = plt.subplot(3, 2, i + 1)
counts = user_info[col].value_counts().sort_index()
# 柱状图
counts.plot.bar(figsize = (8, 6),ax = ax,
edgecolor = 'k', linewidth = 2)
plt.title(f'User {col} distribution')
plt.xlabel(f'{col}')
plt.ylabel('count')
# print(counts)
plt.subplots_adjust()
4、堆叠柱状图及百分比堆叠柱状图
import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
graph_data[[0,1]].plot(kind='bar', stacked=True, figsize=((16, 10)))
plt.show()
import numpy as np
import matplotlib.pyplot as plt
people = ('G1','G2','G3','G4','G5','G6','G7','G8')
segments = 4
# multi-dimensional data
data = [[ 3.40022085, 7.70632498, 6.4097905, 10.51648577, 7.5330039,
7.1123587, 12.77792868, 3.44773477],
[ 11.24811149, 5.03778215, 6.65808464, 12.32220677, 7.45964195,
6.79685302, 7.24578743, 3.69371847],
[ 3.94253354, 4.74763549, 11.73529246, 4.6465543, 12.9952182,
4.63832778, 11.16849999, 8.56883433],
[ 4.24409799, 12.71746612, 11.3772169, 9.00514257, 10.47084185,
10.97567589, 3.98287652, 8.80552122]]
percentages = (np.random.randint(5,20, (len(people), segments)))
print(percentages)
print(percentages[1,0])
y_pos = np.arange(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
colors ='rgwm'
patch_handles = []
# left alignment of data starts at zero
left = np.zeros(len(people))
for i, d in enumerate(data):
patch_handles.append(ax.barh(y_pos, d,
color=colors[i%len(colors)], align='center',
left=left))
left += d
# search all of the bar segments and annotate
for j in range(len(patch_handles)):
print(j)
for i, patch in enumerate(patch_handles[j].get_children()):
# print(i)
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x,y, "%d%%" % (percentages[i,j]), ha='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Scores')
plt.show()
import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=((16, 10)))
# Create the total ad counts for each participant
totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
# Create the percentage of the total ad counts the ad-colicks counts value for each participant was
unclick = [i / j * 100 for i,j in zip(graph_data[0], totals)]
click = [i / j * 100 for i,j in zip(graph_data[1], totals)]
ax.bar(graph_data.index,
# using unclick(0) data
unclick,
# labeled
label='0',
# with border color
edgecolor='white'
)
ax.bar(graph_data.index,
# using unclick(0) data
click,
# labeled
label='1',
# with unclick
bottom=unclick,
# with border color
edgecolor='white'
)
ax.set_ylabel("Percentage")
ax.set_xlabel(x)
plt.show()
import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=((16, 10)))
# Create the total ad counts for each participant
totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
# Create the percentage of the total ad counts the ad-colicks counts value for each participant was
unclick = [i / j * 100 for i,j in zip(graph_data[0], totals)]
click = [i / j * 100 for i,j in zip(graph_data[1], totals)]
patch_handles = []
patch_handles.append(ax.bar(graph_data.index,
# using unclick(0) data
unclick,
# labeled
label='0',
# with border color
edgecolor='white'))
patch_handles.append(ax.bar(graph_data.index,
# using unclick(0) data
click,
# labeled
label='1',
# with unclick
bottom=unclick,
# with border color
edgecolor='white'
))
percentages = []
percentages.append(unclick)
percentages.append(click)
# search all of the bar segments and annotate
for j in range(len(patch_handles)):
for i, patch in enumerate(patch_handles[j].get_children()):
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x,y, "%d%%" % (percentages[j][i]), ha='center')
ax.set_ylabel("Percentage")
ax.set_xlabel(name)
plt.show()