python pandas 的高级操作

  • 1、groupby 与 apply的结合使用
  • 1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
  • 2) 多列list元素 一行变多行
  • 3) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
  • 2、groupby 与 pivot_table的区别与联系
  • 3、每列缺失值情况
  • 4、时间列读取、转换、提取
  • 5、筛选数据、删除重复数据、删除列
  • 6、各列分布情况
  • 4、堆叠柱状图及百分比堆叠柱状图


1、groupby 与 apply的结合使用

  • groupby:主要用于分组聚合,可结合统计函数(mean()、min()、sum()、count()、、、)一起使用,跟聚合函数agg()类 似;
  • apply:可以利用函数包括自定义函数迭代计算,可以结合lambda使用
  • 详情参考 https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
df = pd.DataFrame(data=[[1,2,'book1'], [1, 2, 'book2'], [4, 5, 'book3']], columns=['ID', 'Day', 'title'])
print(df)
# 多行变一行,合成list
df.groupby(['ID', 'Day'])['title'].apply(list)

# 自定义函数
def fun(row):
    tag = eval(row['tag'])
    return tag
df['tag'] = df.apply(fun, axis=1)

python dataframe如何分组并求相关系数 python dataframe分组求和_柱状图

2) 多列list元素 一行变多行
# 使用eval函数解析list元素
df = pd.read_csv('/mnt/mxm/home/project/gitTest/buyer_info_tag.csv', converters={'label_buyer': eval})

# list 拆分为多行
# 方法一
import numpy as np
new_values = np.dstack((np.repeat(buyer_info.buyer_id.values, list(map(len, buyer_info.month.values))),
                       np.concatenate(buyer_info.month.values),
                       np.concatenate(buyer_info.tag.values)))
buyer = pd.DataFrame(data=new_values[0], columns=['buyer_id', 'month', 'tag'])
# 方法二
# 使用explode函数
buyer = buyer_info.explode(['month', 'tag'])
3) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
test = pd.DataFrame(data=[['1',['物理', '历史', '数学']], ['2', ['历史', '物理', '数学']], ['3', ['历史', '语文', '数学']]], columns=['id', 'subject'])
print(test)

test['subject'].apply(lambda str : str.sort(), str)
print(test)

#test['km'] = test['km'].apply(lambda str : tuple(str))
test['subject_str'] = test['subject'].apply(lambda str : str[0] + '_' + str[1] + '_' + str[2])
print(test)

mapping = {'历史_语文_数学': 'a', '历史_数学_物理':'b', '历史_数学_语文': 'c'}
test['label'] = test['subject_str'].replace(mapping)
print(test)

python dataframe如何分组并求相关系数 python dataframe分组求和_柱状图_02

2、groupby 与 pivot_table的区别与联系

df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo","bar", "bar", "bar", "bar"],"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],"C": ["small", "large", "large", "small","small", "large", "small", "small","large"],"D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
print(df)
print('-----------------------')
print('------pivot_table效果---数值sum--')
print(pd.pivot_table(df, values='D', index=['A'],columns=['C'], aggfunc=np.sum))
print('----行数len--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=len))
print('----字符串sum--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=np.sum))
print('-------groupby效果--------')
print(df.groupby(by=['A', 'C'])['D'].sum())
print("------groupby 展平效果-----")
temp = df.groupby(by=['A', 'C'])['D'].sum()
temp = temp.reset_index(level = ['A','C']).rename(columns={'D':'D_SUM'})
print(temp)
print('-----------------------')
print('-----pivot_table效果--数值sum----')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum))
print('-----pivot_table效果并填充空值----数值sum---')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum, fill_value=0))
print('------groupby效果-----')
print(df.groupby(by=['A', 'B', 'C'])['D'].sum())

# 选取科班生
undergraduate_2018 = student_2018[student_2018['PID'].isin([1])].groupby(by=['FROMCARDNO', 'YEAR_MONTH'])['JE'].sum()
# 展平分组
undergraduate_2018 = undergraduate_2018.unstack()
print(undergraduate_2018)
undergraduate_2018.describe()

python dataframe如何分组并求相关系数 python dataframe分组求和_数据分析_03


python dataframe如何分组并求相关系数 python dataframe分组求和_pandas_04


python dataframe如何分组并求相关系数 python dataframe分组求和_柱状图_05

3、每列缺失值情况

# 每列缺失数量
missing = pd.DataFrame(ad_info.isnull().sum()).rename(columns = {0: 'total'})

# 每列数据缺失比例
missing['percent'] = missing['total'] / len(ad_info)
# 数据缺失比例
missing.sort_values('percent', ascending = False)

python dataframe如何分组并求相关系数 python dataframe分组求和_ci_06

4、时间列读取、转换、提取

# 读取数据时,进行时间解析
dateparse = lambda dates: pd.datetime.strptime(dates, '%d/%m/%Y %H:%M:%S')
df = pd.read_csv('/home/mxm/project/gittest/tm_foodied_20216_2022.csv', parse_dates=['order_pay_time'], date_parser=dateparse)
# 列类型object转换时间类型
train['operTime'] = pd.to_datetime(train['operTime'], infer_datetime_format=True)
# 按时间格式,提取年、月、日信息
train['operDate'] = train['operTime'].dt.strftime('%Y-%m-%d')
train['operYear'] = train['operTime'].dt.strftime('%Y')

5、筛选数据、删除重复数据、删除列

# 'PID':47
staff_2018 = df_2018[df_2018['PID'].isin([47])]
staff_2018 = df_2018[df_2018['JE'] >= 0]
staff_2018 = df_2018[(df_2018['JE'] >= 0) & (df_2018['JE'] <= 50)]
# 删除重复数据
staff_2018.drop_duplicates(inplace=True)
print('重复数据的数量:', staff_2018.duplicated().sum())
# 删除列
staff_2018.drop(columns=['spreadAppId'], inplace=True)

6、各列分布情况

for i, col in enumerate(user_info[['age', 'gender', 'city', 'province', 'phoneType','carrier']]):
    # 设置子图排列形式,如3行2列
    ax = plt.subplot(3, 2, i + 1)
    counts = user_info[col].value_counts().sort_index()

    # 柱状图
    counts.plot.bar(figsize = (8, 6),ax = ax,
                      edgecolor = 'k', linewidth = 2)

    plt.title(f'User {col} distribution')
    plt.xlabel(f'{col}')
    plt.ylabel('count')

#     print(counts)
plt.subplots_adjust()

python dataframe如何分组并求相关系数 python dataframe分组求和_柱状图_07

4、堆叠柱状图及百分比堆叠柱状图

import matplotlib.pyplot as plt

graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
    graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
    print(graph_data.head())
    graph_data[[0,1]].plot(kind='bar', stacked=True, figsize=((16, 10)))
    plt.show()

python dataframe如何分组并求相关系数 python dataframe分组求和_ci_08

import numpy as np
import matplotlib.pyplot as plt

people = ('G1','G2','G3','G4','G5','G6','G7','G8')
segments = 4

# multi-dimensional data 
data = [[  3.40022085,   7.70632498,   6.4097905,   10.51648577,   7.5330039,
    7.1123587,   12.77792868,   3.44773477],
 [ 11.24811149,   5.03778215,   6.65808464,  12.32220677,   7.45964195,
    6.79685302,   7.24578743,   3.69371847],
 [  3.94253354,   4.74763549,  11.73529246,   4.6465543,   12.9952182,
    4.63832778,  11.16849999,   8.56883433],
 [  4.24409799,  12.71746612,  11.3772169,    9.00514257,  10.47084185,
   10.97567589,   3.98287652,   8.80552122]]
percentages = (np.random.randint(5,20, (len(people), segments)))
print(percentages)
print(percentages[1,0])
y_pos = np.arange(len(people))

fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)

colors ='rgwm'
patch_handles = []
# left alignment of data starts at zero
left = np.zeros(len(people)) 
for i, d in enumerate(data):
    patch_handles.append(ax.barh(y_pos, d, 
      color=colors[i%len(colors)], align='center', 
      left=left))
    left += d

# search all of the bar segments and annotate
for j in range(len(patch_handles)):
    print(j)
    for i, patch in enumerate(patch_handles[j].get_children()):
#         print(i)
        bl = patch.get_xy()
        x = 0.5*patch.get_width() + bl[0]
        y = 0.5*patch.get_height() + bl[1]
        ax.text(x,y, "%d%%" % (percentages[i,j]), ha='center')

ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Scores')
plt.show()

python dataframe如何分组并求相关系数 python dataframe分组求和_python_09

import matplotlib.pyplot as plt

graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
    graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
    print(graph_data.head())
    
    # Create a figure with a single subplot
    f, ax = plt.subplots(1, figsize=((16, 10)))
    # Create the total ad counts for each participant
    totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
    # Create the percentage of the total ad counts the ad-colicks counts value for each participant was
    unclick = [i / j * 100 for  i,j in zip(graph_data[0], totals)]
    click = [i / j * 100 for  i,j in zip(graph_data[1], totals)]
    ax.bar(graph_data.index, 
       # using unclick(0) data
       unclick, 
       # labeled 
       label='0', 
       # with border color
       edgecolor='white'
       )
    ax.bar(graph_data.index, 
       # using unclick(0) data
       click, 
       # labeled 
       label='1',
       # with unclick
       bottom=unclick,    
       # with border color
       edgecolor='white'
       )
    ax.set_ylabel("Percentage")
    ax.set_xlabel(x)
    plt.show()

python dataframe如何分组并求相关系数 python dataframe分组求和_柱状图_10

import matplotlib.pyplot as plt

graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
    graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name],columns=['label'], aggfunc=np.sum, fill_value=0)
    print(graph_data.head())
    
    # Create a figure with a single subplot
    f, ax = plt.subplots(1, figsize=((16, 10)))
    # Create the total ad counts for each participant
    totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
    # Create the percentage of the total ad counts the ad-colicks counts value for each participant was
    unclick = [i / j * 100 for  i,j in zip(graph_data[0], totals)]
    click = [i / j * 100 for  i,j in zip(graph_data[1], totals)]
    
    
    patch_handles = []

    patch_handles.append(ax.bar(graph_data.index, 
                               # using unclick(0) data
                               unclick, 
                               # labeled 
                               label='0', 
                               # with border color
                               edgecolor='white'))
    patch_handles.append(ax.bar(graph_data.index, 
                               # using unclick(0) data
                               click, 
                               # labeled 
                               label='1',
                               # with unclick
                               bottom=unclick,    
                               # with border color
                               edgecolor='white'
                               ))
    percentages = []
    percentages.append(unclick)
    percentages.append(click)

    # search all of the bar segments and annotate
    for j in range(len(patch_handles)):
        for i, patch in enumerate(patch_handles[j].get_children()):
            bl = patch.get_xy()
            x = 0.5*patch.get_width() + bl[0]
            y = 0.5*patch.get_height() + bl[1]
            ax.text(x,y, "%d%%" % (percentages[j][i]), ha='center')
    
    ax.set_ylabel("Percentage")
    ax.set_xlabel(name)
    plt.show()

python dataframe如何分组并求相关系数 python dataframe分组求和_数据分析_11