python 等深分箱等宽分箱结合二分箱的数据分析

原创

Shen Liang 2023-02-21 09:31:43 博主文章分类：机器学习 ©著作权

文章标签 等深分箱等宽分箱二分类 python 数据 文章分类 JavaScript 前端开发

©著作权归作者所有：来自51CTO博客作者Shen Liang的原创作品，请联系作者获取转载授权，否则将追究法律责任

python 等深分箱等宽分箱结合二分箱的数据分析

等深分箱等宽分箱概述

Python（pandas）里可以通过 qcut（等深分箱，也叫等频分箱，即每箱的样本量基本一致）和 cut（等宽分箱，即每个分箱区间的宽度相同）对样本进行分箱。

详见如下代码部分。本文的数据来自网络，部分代码也有所参照，这里做了注释和延伸，旨在技术交流，如有冒犯之处请联系博主及时处理。

代码演示

#coding:utf-8
import datetime
import pandas as pd
def _binarize_at_median(values, column_name):
    """Return the median split point of *values* and a one-column 0/1 frame.

    ``pd.qcut(values, 2, retbins=True)`` performs equal-depth binning into two
    bins and returns the bin edges (lowest, middle, highest); the middle edge
    is used as the threshold. ``Binarizer`` maps values strictly greater than
    the threshold to 1 and everything else to 0.

    Returns:
        A ``(threshold, frame)`` tuple where ``frame`` shares the index of
        *values* and has a single column named *column_name*.
    """
    from sklearn import preprocessing

    threshold = pd.qcut(values, 2, retbins=True)[1][1]
    binarizer = preprocessing.Binarizer(threshold=threshold)
    # Binarizer expects a 2-D array, so reshape the column to (n_rows, 1);
    # -1 lets numpy infer the row count.
    flags = binarizer.transform(values.values.reshape(-1, 1))
    return threshold, pd.DataFrame(flags, index=values.index, columns=[column_name])


def RFM():
    """Label every customer by interest (F), value (M) and recency (R).

    Reads ``../input/RFM_TRAD_FLOW.csv`` (GBK encoded), derives one metric per
    customer for each dimension, splits each metric into two equal-depth bins
    and maps the resulting 0/1 triple to one of eight textual labels.

    Side effects: changes global pandas display options and prints the three
    split thresholds followed by the first rows of the labelled table.
    """
    trad_flow = pd.read_csv(r'../input/RFM_TRAD_FLOW.csv', encoding="GBK")
    trad_flow_new = trad_flow.copy()
    trad_flow_new['time_format'] = trad_flow_new.time.apply(timeFormat)

    # Use fully qualified option names: the bare 'precision' pattern is
    # ambiguous on recent pandas versions and raises OptionError.
    pd.set_option('display.precision', 2)  # digits shown after the decimal point
    pd.set_option('display.max_columns', 18)  # maximum number of columns shown
    pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames
    pd.set_option('display.width', 200)  # maximum characters per output line

    # F: count transactions per customer and transaction type, then pivot the
    # types into columns (one row per customer).
    F = trad_flow_new.groupby(['cumid', 'type'])[['transID']].count()
    F_trans = pd.pivot_table(F, index='cumid', columns='type', values='transID')
    # A missing customer/type combination means "no such transactions".
    F_trans = F_trans.fillna(0)
    # Interest ratio: special-offer purchases / (special-offer + normal).
    F_trans['interest'] = F_trans.Special_offer / (F_trans.Special_offer + F_trans.Normal)
    # 0/0 (neither normal nor special-offer purchases) yields NaN, which
    # Binarizer rejects; treat such customers as having no interest.
    F_trans['interest'] = F_trans['interest'].fillna(0)

    # M: total amount per customer, summed over all transaction types.
    M = trad_flow_new.groupby(['cumid', 'type'])[['amount']].sum()
    M_trans = pd.pivot_table(M, index='cumid', columns='type', values='amount')
    M_trans = M_trans.fillna(0)
    M_trans['value'] = M_trans.Normal + M_trans.Special_offer + M_trans.returned_goods

    # R: most recent transaction time per customer, as a numeric timestamp so
    # that it can be binned by qcut.
    trad_flow_new['time_new'] = trad_flow_new.time.apply(to_time)
    R = trad_flow_new.groupby(['cumid'])[['time_new']].max()

    threshold, b_f_interest = _binarize_at_median(F_trans['interest'], 'interest')
    print("F 值二分类右边界:\t" + str(threshold))

    threshold, b_m_value = _binarize_at_median(M_trans['value'], 'value')
    print("M 值二分类右边界:\t" + str(threshold))

    threshold, b_r_time = _binarize_at_median(R['time_new'], 'time')
    print("R 值二分类右边界:\t" + str(threshold))

    total = pd.concat([b_f_interest, b_m_value, b_r_time], axis=1)

    # Customer labels for the 2 * 2 * 2 = 8 combinations of (F, M, R) flags.
    label = {
        (0, 0, 0): '无兴趣-低价值-沉默',
        (1, 0, 0): '有兴趣-低价值-沉默',
        (1, 0, 1): '有兴趣-低价值-活跃',
        (0, 0, 1): '无兴趣-低价值-活跃',
        (0, 1, 0): '无兴趣-高价值-沉默',
        (1, 1, 0): '有兴趣-高价值-沉默',
        (1, 1, 1): '有兴趣-高价值-活跃',
        (0, 1, 1): '无兴趣-高价值-活跃',
    }

    # The flags are floats (0.0 / 1.0); they compare and hash equal to the
    # integer keys above, so the tuples can be looked up directly. zip avoids
    # positional indexing on a label-indexed row Series.
    total['label'] = [
        label[key]
        for key in zip(total['interest'], total['value'], total['time'])
    ]
    print(total.head())

def timeFormat(Str):
    """Parse a timestamp such as ``14MAY10:13:31:23`` into a ``datetime``.

    The expected layout is day, abbreviated month name, two-digit year, then
    ``:HH:MM:SS``.
    """
    from datetime import datetime as _datetime

    layout = '%d%b%y:%H:%M:%S'
    parsed = _datetime.strptime(Str, layout)
    return parsed

def to_time(t):
    """Convert a ``14JUN09:17:58:34`` style timestamp to epoch seconds.

    The string is interpreted as local time and returned as a float number of
    seconds, which makes the value usable for numeric binning with qcut.
    """
    from time import mktime, strptime

    layout = '%d%b%y:%H:%M:%S'
    parsed = strptime(t, layout)
    return mktime(parsed)

def boxsplit():
    """Print bin sizes for equal-width and equal-depth binning of sample ages.

    Equal-width binning (``pd.cut``) uses bins of width
    w = (max - min) / N, here (45 - 18) / 3 = 9, so the right edge of the
    i-th bin is min + i * w: 27, 36 and 45.

    Equal-depth (equal-frequency) binning (``pd.qcut``) first derives the cut
    points from the quantiles and then splits the data there, so every bin
    holds roughly the same number of samples, whereas equal-width bins may
    differ a lot in size.

    TODO: chi-square merging is not covered yet.
    """
    bin_count = 3
    sample = pd.DataFrame(data={'age': [18, 19, 23, 25, 27, 29, 34, 45]})
    ages = sample['age']

    # Method 1: equal-width bins.
    equal_width = pd.cut(ages, bin_count)
    print(equal_width.value_counts())

    # Method 2: equal-depth bins; show the quantile cut points first.
    cut_points = [0, 1 / 3, 2 / 3, 1]
    print(ages.quantile(cut_points))
    equal_depth = pd.qcut(ages, bin_count)
    print(equal_depth.value_counts())
def baseTime():
    """Print the epoch seconds and readable form of 1970-01-01 08:00:00 local time.

    The printed seconds value depends on the local time zone of the machine.
    """
    import time

    # struct_time fields: year, month, day, hour, minute, second,
    # weekday, day of year, DST flag.
    local_tuple = (1970, 1, 1, 8, 0, 0, 3, 1, 0)
    seconds = time.mktime(local_tuple)
    print("time.mktime(t) : %f" % seconds)
    readable = time.asctime(time.localtime(seconds))
    print("asctime(localtime(secs)): %s" % readable)




if __name__ == '__main__':
    # Run the three demos in order: RFM labelling, the binning comparison,
    # and the epoch base-time check.
    for demo in (RFM, boxsplit, baseTime):
        demo()

执行结果：

"F:\Program Files\Python37\python.exe" E:/DevData/GiteePython/com/shenl/ml/RFM/RFM.py
F 值二分类右边界:   0.08333333333333333
M 值二分类右边界:   2944.5
R 值二分类右边界:   1284373750.0
interest value time label
cumid
10001 1.0 1.0 1.0 有兴趣-高价值-活跃
10002 0.0 0.0 0.0 无兴趣-低价值-沉默
10003 0.0 1.0 0.0 无兴趣-高价值-沉默
10004 1.0 1.0 0.0 有兴趣-高价值-沉默
10005 0.0 0.0 0.0 无兴趣-低价值-沉默
(17.973, 27.0] 5
(27.0, 36.0] 2
(36.0, 45.0] 1
Name: age, dtype: int64
0.00 18.00
0.33 23.67
0.67 28.33
1.00 45.00
Name: age, dtype: float64
(28.333, 45.0] 3
(17.999, 23.667] 3
(23.667, 28.333] 2
Name: age, dtype: int64
time.mktime(t) : 0.000000
asctime(localtime(secs)): Thu Jan 1 08:00:00 1970