# Python wrappers for methods commonly used in scorecard design
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# EDA分析
# 类别型变量的分布
def plot_cate_var(df, col_list, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    Draw one horizontal count plot per categorical variable on a single figure.

    df: data set
    col_list: list of categorical column names
    hspace: vertical spacing between subplots
    wspace: horizontal spacing between subplots
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    plt_num: number of subplots; at most this many columns are drawn
    x: first argument of plt.subplot (rows of the subplot grid)
    y: second argument of plt.subplot (columns of the subplot grid)
    return: the result of plt.show()
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    # Font that can render Chinese labels; keep the minus sign drawable with it.
    plt.rcParams.update({'font.sans-serif': ['Microsoft YaHei'],
                         'axes.unicode_minus': False})
    slots = range(1, plt_num + 1)
    for slot, column in zip(slots, col_list):
        plt.subplot(x, y, slot)
        plt.title(column)
        sns.countplot(data=df, y=column)
        plt.ylabel('')
    return plt.show()
# 数值型变量的分布
def plot_num_col(df, col_list, hspace=0.4, wspace=0.4, plt_type=None, plt_size=None, plt_num=None, x=None, y=None):
    """
    Draw the distribution of each numeric variable as a histogram or a box plot.

    df: data set
    col_list: list of numeric column names
    hspace: vertical spacing between subplots
    wspace: horizontal spacing between subplots
    plt_type: 'hist' for histograms, 'box' for box plots; any other value
              draws nothing and an empty figure is shown
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    plt_num: number of subplots; at most this many columns are drawn
    x: first argument of plt.subplot (rows of the subplot grid)
    y: second argument of plt.subplot (columns of the subplot grid)
    return: the result of plt.show()
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    if plt_type in ('hist', 'box'):
        for i, col in zip(range(1, plt_num + 1), col_list):
            plt.subplot(x, y, i)
            plt.title(col)
            if plt_type == 'hist':
                values = df[col].dropna()
                # distplot is deprecated in recent seaborn releases; prefer
                # histplot when the installed version provides it.
                if hasattr(sns, 'histplot'):
                    sns.histplot(values, kde=True, stat='density')
                else:
                    sns.distplot(values)
            else:
                sns.boxplot(data=df, x=col)
            plt.xlabel('')
    return plt.show()
# 类别型变量的违约率分析
def plot_default_cate(df, col_list, target, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    Plot the default (bad) rate of every category of each categorical variable.

    A vertical line marks the overall default rate of the data set.

    df: data set
    col_list: list of categorical column names
    target: name of the target column (summed to count bad samples)
    hspace: vertical spacing between subplots
    wspace: horizontal spacing between subplots
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    plt_num: number of subplots; at most this many columns are drawn
    x: first argument of plt.subplot (rows of the subplot grid)
    y: second argument of plt.subplot (columns of the subplot grid)
    return: the result of plt.show()
    """
    overall_rate = df[target].sum() * 1.0 / df[target].count()
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    plt.rcParams.update({'font.sans-serif': ['Microsoft YaHei'],
                         'axes.unicode_minus': False})
    for slot, column in zip(range(1, plt_num + 1), col_list):
        # Per-category sample count and bad count in one aggregation.
        stats = (df.groupby(column)[target]
                   .agg(['count', 'sum'])
                   .rename(columns={'count': 'total', 'sum': 'bad'}))
        stats['default_rate'] = stats['bad'] / stats['total']
        stats = stats.reset_index()
        plt.subplot(x, y, slot)
        plt.title(column)
        plt.axvline(x=overall_rate)
        sns.barplot(data=stats, y=column, x='default_rate')
        plt.ylabel('')
    return plt.show()
# 数值型变量的违约率分析
def plot_default_num(df, col_list, target, hspace=0.4, wspace=0.4, q=None, plt_size=None, plt_num=None, x=None, y=None):
    """
    Plot the default (bad) rate per quantile bucket of each numeric variable.

    A horizontal line marks the overall default rate of the data set.

    df: data set
    col_list: list of numeric column names
    target: name of the target column (summed to count bad samples)
    hspace: vertical spacing between subplots
    wspace: horizontal spacing between subplots
    q: number of quantile buckets passed to pd.qcut (duplicate edges are dropped)
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    plt_num: number of subplots; at most this many columns are drawn
    x: first argument of plt.subplot (rows of the subplot grid)
    y: second argument of plt.subplot (columns of the subplot grid)
    return: the result of plt.show()
    """
    overall_rate = df[target].sum() * 1.0 / df[target].count()
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    for slot, column in zip(range(1, plt_num + 1), col_list):
        buckets = pd.qcut(df[column], q=q, duplicates='drop')
        stats = (df.groupby(buckets)[target]
                   .agg(['count', 'sum'])
                   .rename(columns={'count': 'total', 'sum': 'bad'}))
        stats['default_rate'] = stats['bad'] / stats['total']
        stats = stats.reset_index()
        plt.subplot(x, y, slot)
        plt.title(column)
        plt.axhline(y=overall_rate)
        sns.pointplot(data=stats, x=column, y='default_rate', color='hotpink')
        plt.xticks(rotation=60)
        plt.xlabel('')
    return plt.show()
# 变量woe离散化
# 变量woe结果表
def woe_df_concat(bin_df):
    """
    Stack per-variable binning tables into one WOE result table.

    bin_df: list of binning tables; each table's index name is the variable name
    return: one DataFrame whose first column 'col' holds the variable name and
            whose 'bin' column holds the former index values
    """
    frames = [
        table.reset_index()
             .assign(col=table.index.name)
             .rename(columns={table.index.name: 'bin'})
        for table in bin_df
    ]
    stacked = pd.concat(frames, axis=0)
    # 'col' was appended last by assign(); move it to the front for readability.
    name_part = stacked['col']
    other_part = stacked.iloc[:, :-1]
    return pd.concat([name_part, other_part], axis=1).reset_index(drop=True)
# woe转换
def woe_transform(df, target, df_woe):
    """
    Replace every feature value by the WOE of the bin it falls into.

    df: data set
    target: name of the target column (left untouched)
    df_woe: WOE result table with columns 'col', 'min_bin', 'max_bin', 'woe'
    return: a copy of df with every non-target column converted to WOE values

    A value is matched to a bin when min_bin <= value <= max_bin (or
    value == min_bin for single-valued bins). Values that match no bin
    keep the initial WOE of 0.
    """
    df2 = df.copy()
    for col in df2.drop([target], axis=1).columns:
        x = df2[col]
        bin_map = df_woe[df_woe.col == col]
        bin_res = np.zeros(x.shape[0], dtype=float)
        for lower, upper, woe in zip(bin_map['min_bin'], bin_map['max_bin'], bin_map['woe']):
            if lower == upper:
                in_bin = x == lower
            else:
                in_bin = (x >= lower) & (x <= upper)
            # Use the boolean mask positionally so the result does not depend
            # on the index labels of df (e.g. after a train/test split).
            bin_res[np.asarray(in_bin, dtype=bool)] = woe
        df2[col] = pd.Series(bin_res, index=x.index, name=x.name)
    return df2
# 变量分箱
# 类别性变量的分箱
def binning_cate(df, col_list, target):
    """
    Build the binning table (one bin per category) for categorical variables.

    df: data set
    col_list: list of categorical column names
    target: name of the target column (summed to count bad samples)
    return:
      bin_df: list with one binning table per variable
      iv_value: list with the IV of each variable, rounded to 3 decimals

    The variable name and its IV are printed for every column.
    """
    n_total = df[target].count()
    n_bad = df[target].sum()
    n_good = n_total - n_bad
    overall_odds = n_good * 1.0 / n_bad

    def _gb_label(bin_odds):
        # Good/bad index: odds of the bin relative to the overall odds, as a percentage.
        if bin_odds >= overall_odds:
            return str(round((bin_odds / overall_odds) * 100, 0)) + str('G')
        return str(round((overall_odds / bin_odds) * 100, 0)) + str('B')

    bin_df, iv_value = [], []
    for col in col_list:
        grouped = df.groupby([col], as_index=True)
        table = pd.DataFrame()
        table['min_bin'] = grouped[col].min()
        table['max_bin'] = grouped[col].max()
        table['total'] = grouped[target].count()
        table['totalrate'] = table['total'] / n_total
        table['bad'] = grouped[target].sum()
        table['badrate'] = table['bad'] / table['total']
        table['good'] = table['total'] - table['bad']
        table['goodrate'] = table['good'] / table['total']
        table['badattr'] = table['bad'] / n_bad
        table['goodattr'] = (table['total'] - table['bad']) / n_good
        table['odds'] = table['good'] / table['bad']
        table['GB_index'] = [_gb_label(bin_odds) for bin_odds in table.odds]
        table['woe'] = np.log(table['badattr'] / table['goodattr'])
        table['bin_iv'] = (table['badattr'] - table['goodattr']) * table['woe']
        table['IV'] = table['bin_iv'].sum()
        iv = table['bin_iv'].sum().round(3)
        print('变量名:{}'.format(col))
        print('IV:{}'.format(iv))
        print('\t')
        bin_df.append(table)
        iv_value.append(iv)
    return bin_df, iv_value
# 类别性变量iv的明细表
def iv_cate(df, col_list, target):
    """
    Return the IV of each categorical variable, highest first.

    df: data set
    col_list: list of categorical column names
    target: name of the target column
    return: DataFrame with columns 'col' and 'iv', sorted by 'iv' descending
    """
    _, iv_values = binning_cate(df, col_list, target)
    summary = pd.DataFrame({'col': col_list,
                            'iv': iv_values})
    return summary.sort_values('iv', ascending=False)
# 数值型变量的分箱
# 先用卡方分箱输出变量的分割点
def split_data(df, col, split_num):
    """
    Pick roughly equal-frequency split values for a variable.

    df: original data set
    col: name of the variable to split
    split_num: number of groups; split_num - 1 split positions are taken
    return: sorted list of distinct split values
    """
    group_size = df.shape[0] // split_num  # samples per group after an even split
    ordered = sorted(df[col])              # variable values in ascending order
    picked = {ordered[k * group_size] for k in range(1, split_num)}
    return sorted(picked)
def assign_group(x, split_bin):
    """
    Map a value onto the split point that closes its interval.

    x: value of the variable
    split_bin: list of split points produced by split_data
    return: the smallest split point when x is at or below it, the sentinel
            10e10 when x is above the largest split point, otherwise the
            upper split point of the interval (lower, upper] containing x.
            None is returned when no interval matches (e.g. x is NaN).
    """
    lowest = min(split_bin)
    if x <= lowest:
        return lowest
    if x > max(split_bin):
        return 10e10
    for lower, upper in zip(split_bin, split_bin[1:]):
        if lower < x <= upper:
            return upper
    return None
def bin_bad_rate(df, col, target, grantRateIndicator=0):
    """
    Compute the bad rate of every group of a variable.

    df: original data set
    col: original variable or its mapped column
    target: name of the target column (summed to count bad samples)
    grantRateIndicator: 0 returns (dict_bad, regroup); any other value also
                        returns the overall bad rate as a third element
    return:
      dict_bad: {group value: bad rate}
      regroup: DataFrame with columns [col, 'total', 'bad', 'bad_rate']
      all_bad_rate: overall bad rate (only when grantRateIndicator != 0)
    """
    regroup = (df.groupby([col])[target]
                 .agg(['count', 'sum'])
                 .rename(columns={'count': 'total', 'sum': 'bad'})
                 .reset_index())
    regroup['bad_rate'] = regroup['bad'] / regroup['total']  # bad rate of each group
    dict_bad = dict(zip(regroup[col], regroup['bad_rate']))
    if grantRateIndicator == 0:
        return (dict_bad, regroup)
    all_bad_rate = df[target].sum() / df.shape[0]  # overall bad rate
    return (dict_bad, regroup, all_bad_rate)
def cal_chi2(df, all_bad_rate):
    """
    Chi-square statistic of the observed bad counts against the expected ones.

    df: the regroup table returned by bin_bad_rate (needs 'total' and 'bad')
    all_bad_rate: overall bad rate returned by bin_bad_rate
    return: sum over groups of (expected - bad) ** 2 / expected, where
            expected = total * all_bad_rate
    """
    expected_bad = df['total'] * all_bad_rate  # expected number of bad samples per group
    return sum((expected - observed) ** 2 / expected
               for expected, observed in zip(expected_bad, df['bad']))
def assign_bin(x, cutoffpoints):
    """
    Map a value to the label of the bin it belongs to.

    x: value of the variable
    cutoffpoints: list of cut-off points of the binning
    return: 'Bin 0' when x is at or below the first cut-off, 'Bin <k>' with
            k = len(cutoffpoints) when x is above the last cut-off, otherwise
            'Bin <i + 1>' for cutoffpoints[i] < x <= cutoffpoints[i + 1].
            None is returned when no interval matches (e.g. x is NaN).
    """
    if x <= cutoffpoints[0]:
        return 'Bin 0'
    if x > cutoffpoints[-1]:
        return 'Bin {}'.format(len(cutoffpoints))
    for number, (lower, upper) in enumerate(zip(cutoffpoints, cutoffpoints[1:]), start=1):
        if lower < x <= upper:
            return 'Bin {}'.format(number)
    return None
def ChiMerge(df, col, target, max_bin=5, min_binpct=0):
    """
    Compute chi-square-merge cut-off points for a numeric variable.

    df: data set
    col: name of the variable to bin
    target: name of the target column (summed to count bad samples)
    max_bin: maximum number of bins kept by the merge phase
    min_binpct: minimum share of samples a bin must hold; 0 disables the check
    return: list of cut-off points (upper edge of every bin except the last);
            an empty list when everything collapses into a single bin

    Steps:
      1. With more than 100 distinct values, the variable is first coarsened
         to roughly 100 groups via split_data / assign_group.
      2. Adjacent groups with the smallest chi-square are merged until at most
         max_bin groups remain.
      3. Bins containing only good or only bad samples are merged into the
         neighbour with the smaller chi-square.
      4. If min_binpct > 0, the smallest bin is merged the same way until every
         bin holds at least min_binpct of the samples or only two cut-offs
         remain.

    NOTE(review): missing values in col are not handled here; presumably they
    are filled before binning -- confirm against the callers.
    """
    def _bin_number(label):
        # Labels come from assign_bin and look like 'Bin 3'; compare them
        # numerically because 'Bin 10' sorts before 'Bin 2' as a string.
        return int(str(label).split()[-1])

    def _relabel(cutoffs):
        # Refresh the bin label of every sample for the given cut-offs.
        df2['col_map_bin'] = df2['col_map'].apply(lambda v: assign_bin(v, cutoffs))

    def _pair_chi2(bin_a, bin_b):
        # Chi-square of the two given bins taken together.
        pair = df2[df2['col_map_bin'].isin([bin_a, bin_b])]
        _, pair_regroup = bin_bad_rate(pair, 'col_map_bin', target)
        return cal_chi2(pair_regroup, all_bad_rate)

    def _merge_bin(cutoffs, label):
        # Return new cut-offs with bin `label` merged into a neighbour.
        # Bin k spans (cutoffs[k - 1], cutoffs[k]], so dropping cutoffs[k - 1]
        # merges it with the previous bin and dropping cutoffs[k] with the next.
        k = _bin_number(label)
        if k >= len(cutoffs):      # last bin: only a previous neighbour exists
            return cutoffs[:-1]
        if k == 0:                 # first bin: only a next neighbour exists
            return cutoffs[1:]
        chi_prev = _pair_chi2('Bin {}'.format(k - 1), label)
        chi_next = _pair_chi2('Bin {}'.format(k + 1), label)
        drop = k - 1 if chi_prev < chi_next else k
        return cutoffs[:drop] + cutoffs[drop + 1:]

    n_unique = len(set(df[col]))  # number of distinct values of the variable
    df2 = df.copy()
    if n_unique > 100:
        # Too many distinct values: coarsen to about 100 groups first.
        split_col = split_data(df2, col, 100)
        df2['col_map'] = df2[col].map(lambda v: assign_group(v, split_col))
    else:
        df2['col_map'] = df2[col]

    (dict_bad, regroup, all_bad_rate) = bin_bad_rate(df2, 'col_map', target, grantRateIndicator=1)
    col_map_unique = sorted(set(df2['col_map']))
    group_interval = [[v] for v in col_map_unique]  # every value starts as its own group

    # Merge phase: repeatedly merge the adjacent pair with the smallest chi-square.
    while len(group_interval) > max_bin:
        chi_list = []
        for i in range(len(group_interval) - 1):
            temp_group = group_interval[i] + group_interval[i + 1]
            chi_df = regroup[regroup['col_map'].isin(temp_group)]
            chi_list.append(cal_chi2(chi_df, all_bad_rate))
        best_combined = chi_list.index(min(chi_list))
        group_interval[best_combined] = group_interval[best_combined] + group_interval[best_combined + 1]
        del group_interval[best_combined + 1]

    group_interval = [sorted(g) for g in group_interval]
    # The cut-off of a group is its largest value; the last group has no upper cut-off.
    cutoffpoints = [max(g) for g in group_interval[:-1]]
    if not cutoffpoints:
        # Single bin: nothing to check, and assign_bin cannot label without cut-offs.
        return cutoffpoints

    # Purity phase: no bin may contain only good or only bad samples.
    _relabel(cutoffpoints)
    (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
    while min(dict_bad.values()) == 0 or max(dict_bad.values()) == 1:
        pure_bin = regroup[regroup['bad_rate'].isin([0, 1])].col_map_bin.tolist()[0]
        cutoffpoints = _merge_bin(cutoffpoints, pure_bin)
        if not cutoffpoints:
            return cutoffpoints
        _relabel(cutoffpoints)
        (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)

    # Size phase: every bin must hold at least min_binpct of the samples.
    if min_binpct > 0:
        n_samples = df2.shape[0]
        while len(cutoffpoints) > 2:
            _relabel(cutoffpoints)
            bin_pct = df2['col_map_bin'].value_counts() / n_samples
            if bin_pct.min() >= min_binpct:
                break
            cutoffpoints = _merge_bin(cutoffpoints, bin_pct.idxmin())
    return cutoffpoints
# 数值型变量的分箱(卡方分箱)
def binning_num(df, target, col_list, max_bin=None, min_binpct=None):
    """
    Bin numeric variables with ChiMerge and build their binning tables.

    df: data set
    target: name of the target column (summed to count bad samples)
    col_list: list of numeric column names
    max_bin: maximum number of bins; None uses ChiMerge's own default
    min_binpct: minimum share of samples per bin; None uses ChiMerge's own default
    return:
      bin_df: list with one binning table per variable
      iv_value: list with the IV of each variable, rounded to 3 decimals

    The variable name and its IV are printed for every column.
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    all_odds = good / bad

    # Forward only the options the caller actually set: passing None through
    # would make ChiMerge compare numbers with None and raise TypeError.
    chi_options = {}
    if max_bin is not None:
        chi_options['max_bin'] = max_bin
    if min_binpct is not None:
        chi_options['min_binpct'] = min_binpct

    bin_df = []
    iv_value = []
    for col in col_list:
        cut = ChiMerge(df, col, target, **chi_options)
        # Open-ended outer edges so every value falls into some bin.
        edges = [float('-inf')] + list(cut) + [float('inf')]
        bucket = pd.cut(df[col], edges)
        d1 = df.groupby(bucket)
        d2 = pd.DataFrame()
        d2['min_bin'] = d1[col].min()
        d2['max_bin'] = d1[col].max()
        d2['total'] = d1[target].count()
        d2['totalrate'] = d2['total'] / total
        d2['bad'] = d1[target].sum()
        d2['badrate'] = d2['bad'] / d2['total']
        d2['good'] = d2['total'] - d2['bad']
        d2['goodrate'] = d2['good'] / d2['total']
        d2['badattr'] = d2['bad'] / bad
        d2['goodattr'] = (d2['total'] - d2['bad']) / good
        d2['odds'] = d2['good'] / d2['bad']
        GB_list = []
        for i in d2.odds:
            # Good/bad index: odds of the bin relative to the overall odds.
            if i >= all_odds:
                GB_index = str(round((i / all_odds) * 100, 0)) + str('G')
            else:
                GB_index = str(round((all_odds / i) * 100, 0)) + str('B')
            GB_list.append(GB_index)
        d2['GB_index'] = GB_list
        d2['woe'] = np.log(d2['badattr'] / d2['goodattr'])
        d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['woe']
        d2['IV'] = d2['bin_iv'].sum()
        iv = d2['bin_iv'].sum().round(3)
        print('变量名:{}'.format(col))
        print('IV:{}'.format(iv))
        print('\t')
        bin_df.append(d2)
        iv_value.append(iv)
    return bin_df, iv_value
# 数值型变量的iv明细表
def iv_num(df, target, col_list, max_bin=None, min_binpct=None):
    """
    Return the IV of each numeric variable, highest first.

    df: data set
    target: name of the target column
    col_list: list of numeric column names
    max_bin: maximum number of bins, forwarded to binning_num
    min_binpct: minimum share of samples per bin, forwarded to binning_num
    return: DataFrame with columns 'col' and 'iv', sorted by 'iv' descending
    """
    _, iv_values = binning_num(df, target, col_list, max_bin=max_bin, min_binpct=min_binpct)
    summary = pd.DataFrame({'col': col_list,
                            'iv': iv_values})
    return summary.sort_values('iv', ascending=False)
# 自定义分箱
def binning_self(df, col, target, cut=None, right_border=True):
    """
    Bin a single variable with user-supplied edges and build its binning table.

    df: data set
    col: name of the variable to bin
    target: name of the target column (summed to count bad samples)
    cut: list of bin edges passed to pd.cut
    right_border: True for right-closed intervals, False for left-closed ones
    return:
      bin_df: binning table of the variable
      iv_value: IV of the variable, rounded to 3 decimals

    The variable name and its IV are printed.
    """
    n_total = df[target].count()
    n_bad = df[target].sum()
    n_good = n_total - n_bad
    overall_odds = n_good / n_bad

    def _gb_label(bin_odds):
        # Good/bad index: odds of the bin relative to the overall odds, as a percentage.
        if bin_odds >= overall_odds:
            return str(round((bin_odds / overall_odds) * 100, 0)) + str('G')
        return str(round((overall_odds / bin_odds) * 100, 0)) + str('B')

    grouped = df.groupby(pd.cut(df[col], cut, right=right_border))
    table = pd.DataFrame()
    table['min_bin'] = grouped[col].min()
    table['max_bin'] = grouped[col].max()
    table['total'] = grouped[target].count()
    table['totalrate'] = table['total'] / n_total
    table['bad'] = grouped[target].sum()
    table['badrate'] = table['bad'] / table['total']
    table['good'] = table['total'] - table['bad']
    table['goodrate'] = table['good'] / table['total']
    table['badattr'] = table['bad'] / n_bad
    table['goodattr'] = (table['total'] - table['bad']) / n_good
    table['odds'] = table['good'] / table['bad']
    table['GB_index'] = [_gb_label(bin_odds) for bin_odds in table.odds]
    table['woe'] = np.log(table['badattr'] / table['goodattr'])
    table['bin_iv'] = (table['badattr'] - table['goodattr']) * table['woe']
    table['IV'] = table['bin_iv'].sum()
    iv_value = table['bin_iv'].sum().round(3)
    print('变量名:{}'.format(col))
    print('IV:{}'.format(iv_value))
    bin_df = table.copy()
    return bin_df, iv_value
# 变量分箱结果的检查
# woe的可视化
def plot_woe(bin_df, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    Draw the WOE of every bin as a bar chart, one subplot per variable.

    bin_df: list of binning tables; each table's index name is the variable name
    hspace: vertical spacing between subplots
    wspace: horizontal spacing between subplots
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    plt_num: number of subplots; at most this many tables are drawn
    x: first argument of plt.subplot (rows of the subplot grid)
    y: second argument of plt.subplot (columns of the subplot grid)
    return: the result of plt.show()
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    for slot, table in zip(range(1, plt_num + 1), bin_df):
        variable = table.index.name
        flat = table.reset_index()  # expose the bins as a regular column for seaborn
        plt.subplot(x, y, slot)
        plt.title(variable)
        sns.barplot(data=flat, x=variable, y='woe')
        plt.xlabel('')
        plt.xticks(rotation=30)
    return plt.show()
# 检验woe是否单调
def woe_monoton(bin_df):
    """
    Check whether the WOE of each variable changes monotonically across bins.

    bin_df: list of binning tables; each table's index name is the variable name
    return:
      woe_notmonoton_col: list of variables whose WOE is not monotonic
      woe_judge_df: DataFrame with columns 'col' and 'judge_monoton'
                    ('True' / 'False' as strings)

    A variable is flagged when some interior bin is a strict local minimum or
    maximum of the WOE sequence. Tables with exactly two bins are always
    reported as monotonic.
    """
    woe_notmonoton_col, names, verdicts = [], [], []
    for table in bin_df:
        name = table.index.name
        sequence = list(table.woe)
        has_turning_point = table.shape[0] != 2 and any(
            (cur < nxt and cur < prev) or (cur > nxt and cur > prev)
            for prev, cur, nxt in zip(sequence, sequence[1:], sequence[2:])
        )
        names.append(name)
        if has_turning_point:
            woe_notmonoton_col.append(name)
            verdicts.append('False')
        else:
            verdicts.append('True')
    woe_judge_df = pd.DataFrame({'col': names,
                                 'judge_monoton': verdicts})
    return woe_notmonoton_col, woe_judge_df
# 检查某个区间的woe是否大于1
def woe_large(bin_df):
    """
    Check whether any bin of a variable has a WOE of 1 or more.

    bin_df: list of binning tables; each table's index name is the variable name
    return:
      woe_large_col: list of variables with at least one bin whose WOE >= 1
      woe_judge_df: DataFrame with columns 'col' and 'judge_large'
                    ('True' / 'False' as strings)
    """
    woe_large_col, names, verdicts = [], [], []
    for table in bin_df:
        name = table.index.name
        names.append(name)
        if any(value >= 1 for value in list(table.woe)):
            verdicts.append('True')
            woe_large_col.append(name)
        else:
            verdicts.append('False')
    woe_judge_df = pd.DataFrame({'col': names,
                                 'judge_large': verdicts})
    return woe_large_col, woe_judge_df
# 变量筛选
# xgboost筛选变量
def select_xgboost(df, target, imp_num=None):
    """
    Rank variables by XGBoost feature importance and keep the top ones.

    df: data set
    target: name of the target column
    imp_num: number of variables to keep; None keeps all of them
    return:
      xg_fea_imp: DataFrame with columns 'col' and 'imp', most important first
      xg_select_col: list of the selected variable names
    """
    x = df.drop([target], axis=1)
    y = df[target]
    # eval_metric is set on the estimator: recent xgboost releases no longer
    # accept it as a fit() argument.
    xgmodel = XGBClassifier(random_state=0, eval_metric='auc')
    xgmodel = xgmodel.fit(x, y)
    xg_fea_imp = pd.DataFrame({'col': list(x.columns),
                               'imp': xgmodel.feature_importances_})
    xg_fea_imp = xg_fea_imp.sort_values('imp', ascending=False).reset_index(drop=True).iloc[:imp_num, :]
    xg_select_col = list(xg_fea_imp.col)
    return xg_fea_imp, xg_select_col
# 随机森林筛选变量
def select_rf(df, target, imp_num=None):
    """
    Rank variables by random-forest feature importance and keep the top ones.

    df: data set
    target: name of the target column
    imp_num: number of variables to keep; None keeps all of them
    return:
      rf_fea_imp: DataFrame with columns 'col' and 'imp', most important first
      rf_select_col: list of the selected variable names
    """
    features = df.drop([target], axis=1)
    labels = df[target]
    forest = RandomForestClassifier(random_state=0).fit(features, labels)
    ranking = pd.DataFrame({'col': list(features.columns),
                            'imp': forest.feature_importances_})
    ranking = ranking.sort_values('imp', ascending=False).reset_index(drop=True)
    rf_fea_imp = ranking.iloc[:imp_num, :]
    rf_select_col = list(rf_fea_imp.col)
    return rf_fea_imp, rf_select_col
# 相关性可视化
def plot_corr(df, col_list, threshold=None, plt_size=None, is_annot=True):
    """
    Draw a heat map of the pairwise correlations of the given variables.

    df: data set
    col_list: list of column names
    threshold: cells whose absolute correlation is at or below this value are
               masked; None shows every cell
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    is_annot: whether to print the correlation value in each cell
    return: the result of plt.show()
    """
    corr_df = df.loc[:, col_list].corr()
    # Comparing against None raises TypeError, so only build a mask when a
    # threshold was actually given.
    mask = None if threshold is None else np.abs(corr_df) <= threshold
    plt.figure(figsize=plt_size)
    sns.heatmap(corr_df, annot=is_annot, cmap='rainbow', vmax=1, vmin=-1, mask=mask)
    return plt.show()
# 相关性剔除
def forward_delete_corr(df, col_list, threshold=None):
    """
    Drop variables that are strongly correlated with an earlier kept variable.

    df: data set
    col_list: list of column names, in priority order (earlier ones win)
    threshold: a variable is dropped when its absolute correlation with an
               already kept variable is at or above this value
    return: list of the variables that remain, in their original order

    col_list itself is not modified.
    """
    candidates = list(col_list)
    # One correlation matrix is enough: pairwise correlations do not depend on
    # which other columns are still in the list.
    corr_abs = df.loc[:, candidates].corr().abs()
    kept = []
    for col in candidates:
        # `not (... >= threshold)` keeps a column whose correlation is NaN
        # (e.g. a constant column).
        if all(not (corr_abs.loc[col, other] >= threshold) for other in kept):
            kept.append(col)
    return kept
# 相关性变量映射关系
def corr_mapping(df, col_list, threshold=None):
    """
    List every pair of variables whose absolute correlation reaches a threshold.

    df: data set
    col_list: list of column names
    threshold: a pair is reported when abs(correlation) >= threshold
    return: DataFrame with columns 'col_A', 'col_B' and 'corr'; each pair
            appears once, with col_A listed before col_B in col_list
    """
    corr_df = df.loc[:, col_list].corr()
    col_a, col_b, corr_value = [], [], []
    for offset, col in enumerate(col_list[:-1], start=1):
        # Only look below the diagonal so each pair is visited once.
        below_diagonal = corr_df[col].iloc[offset:]
        strong = [(other, value)
                  for other, value in zip(below_diagonal.index, below_diagonal.values)
                  if abs(value) >= threshold]
        col_a.extend([col] * len(strong))
        col_b.extend(other for other, _ in strong)
        corr_value.extend(value for _, value in strong)
    corr_map_df = pd.DataFrame({'col_A': col_a,
                                'col_B': col_b,
                                'corr': corr_value})
    return corr_map_df
# 显著性筛选,在筛选前需要做woe转换
def forward_delete_pvalue(x_train, y_train):
    """
    Forward selection of variables by p-value of a statsmodels logit fit.

    x_train: training features
    y_train: training target
    return: list of the variables kept after significance screening

    Variables are added one at a time; after every fit, each variable whose
    p-value is 0.05 or more is removed again. A final model on the kept
    variables is fitted and its summary printed.
    """
    pvalues_col = []
    for candidate in list(x_train.columns):
        pvalues_col.append(candidate)
        design = sm.add_constant(x_train.loc[:, pvalues_col])
        fitted = sm.Logit(y_train, design).fit()
        # Position 0 is skipped: add_constant puts the intercept there.
        names = fitted.pvalues.index[1:]
        values = fitted.pvalues.values[1:]
        for name, pvalue in zip(names, values):
            if pvalue >= 0.05:
                pvalues_col.remove(name)
    final_design = sm.add_constant(x_train.loc[:, pvalues_col])
    final_model = sm.Logit(y_train, final_design).fit()
    print(final_model.summary2())
    return pvalues_col
# 逻辑回归系数符号筛选,在筛选前需要做woe转换
def forward_delete_coef(x_train, y_train):
    """
    Forward selection of variables by the sign of logistic-regression coefficients.

    x_train: training features
    y_train: training target
    return:
      coef_col: list of the variables kept after the sign screening
      lr_coe: DataFrame with columns 'col' and 'coef' of the final model

    Variables are added one at a time; a newly added variable is dropped again
    when the refitted model has any negative coefficient.
    """
    coef_col = []
    for candidate in list(x_train.columns):
        coef_col.append(candidate)
        trial = LogisticRegression(random_state=0).fit(x_train.loc[:, coef_col], y_train)
        trial_coef = pd.DataFrame({'col': coef_col, 'coef': trial.coef_[0]})
        negative_count = trial_coef[trial_coef.coef < 0].shape[0]
        if negative_count > 0:
            coef_col.remove(candidate)
    final_model = LogisticRegression(random_state=0).fit(x_train.loc[:, coef_col], y_train)
    lr_coe = pd.DataFrame({'col': coef_col,
                           'coef': final_model.coef_[0]})
    return coef_col, lr_coe
# 数据预处理
# 每个变量缺失率的计算
def missing_cal(df):
    """
    Compute the share of missing values of every column.

    df: data set
    return: DataFrame with columns 'col' and 'missing_pct', sorted by
            'missing_pct' descending
    """
    missing_ratio = df.isnull().sum() / df.shape[0]
    table = (missing_ratio.to_frame()
                          .reset_index()
                          .rename(columns={'index': 'col', 0: 'missing_pct'}))
    return table.sort_values('missing_pct', ascending=False).reset_index(drop=True)
# 变量的缺失分布图
def plot_missing_var(df, plt_size=None):
    """
    Draw a histogram of the per-variable missing rates.

    df: data set
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    return: the result of plt.show()
    """
    missing_rates = missing_cal(df)['missing_pct']
    plt.figure(figsize=plt_size)
    plt.rcParams.update({'font.sans-serif': ['Microsoft YaHei'],
                         'axes.unicode_minus': False})
    # Ten equal-width buckets covering missing rates from 0 to 1.
    bucket_edges = np.arange(0, 1.1, 0.1)
    plt.hist(x=missing_rates, bins=bucket_edges, color='hotpink', ec='k', alpha=0.8)
    plt.ylabel('缺失值个数')
    plt.xlabel('缺失率')
    return plt.show()
# 单个样本的缺失分布
def plot_missing_user(df, plt_size=None):
    """
    Draw the number of missing variables per sample as a sorted line chart.

    df: data set
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    return: the result of plt.show()
    """
    missing_series = df.isnull().sum(axis=1)
    # Sorted ascending so the curve shows how many samples exceed a given count.
    list_missing_num = sorted(missing_series.values)
    plt.figure(figsize=plt_size)
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.plot(range(df.shape[0]), list_missing_num)
    plt.ylabel('缺失变量个数')
    plt.xlabel('samples')
    return plt.show()
# 缺失值剔除(单个变量)
def missing_delete_var(df, threshold=None):
    """
    Drop the variables whose missing rate reaches a threshold.

    df: data set
    threshold: variables with missing rate >= threshold are dropped
    return: a copy of df without those variables

    The number of dropped variables is printed.
    """
    missing_df = missing_cal(df)
    over_limit = missing_df[missing_df.missing_pct >= threshold]
    trimmed = df.copy().drop(list(over_limit.col), axis=1)
    print('缺失率超过{}的变量个数为{}'.format(threshold, over_limit.shape[0]))
    return trimmed
# 缺失值剔除(单个样本)
def missing_delete_user(df, threshold=None):
    """
    Drop the samples whose number of missing variables reaches a threshold.

    df: data set
    threshold: samples with at least this many missing variables are dropped
    return: a copy of df without those samples

    The number of dropped samples is printed.
    """
    df2 = df.copy()
    missing_count = df.isnull().sum(axis=1)
    # Boolean mask applied positionally, so the rows dropped are correct for
    # any index (not only the default 0..n-1 RangeIndex).
    drop_mask = (missing_count >= threshold).values
    df2 = df2.loc[~drop_mask]
    print('缺失变量个数在{}以上的用户数有{}个'.format(threshold, int(drop_mask.sum())))
    return df2
# 缺失值填充(类别型变量)
def fillna_cate_var(df, col_list, fill_type=None):
    """
    Fill missing values of categorical variables.

    df: data set
    col_list: list of categorical column names
    fill_type: 'class' fills with the literal 'unknown' (missing becomes its
               own category); 'mode' fills with the most frequent value;
               any other value leaves the data unchanged
    return: a filled copy of df
    """
    filled = df.copy()
    if fill_type == 'class':
        for column in col_list:
            filled[column] = filled[column].fillna('unknown')
    elif fill_type == 'mode':
        for column in col_list:
            most_common = filled[column].mode()[0]
            filled[column] = filled[column].fillna(most_common)
    return filled
# 数值型变量的填充
# 针对缺失率在5%以下的变量用中位数填充
# 缺失率在5%--15%的变量用随机森林填充,可先对缺失率较低的变量先用中位数填充,在用没有缺失的样本来对变量作随机森林填充
# 缺失率超过15%的变量建议当做一个类别
def fillna_num_var(df, col_list, fill_type=None, filled_df=None):
    """
    Fill missing values of numeric variables.

    df: data set
    col_list: list of numeric column names
    fill_type: 'median' fills with the column median; 'class' fills with -999
               (missing becomes its own category); 'rf' predicts the missing
               values with a random forest trained on filled_df; any other
               value leaves the data unchanged
    filled_df: already complete features, used only when fill_type == 'rf'
    return: a filled copy of df
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type == 'median':
            df2[col] = df2[col].fillna(df2[col].median())
        elif fill_type == 'class':
            df2[col] = df2[col].fillna(-999)
        elif fill_type == 'rf':
            if not df2[col].isnull().any():
                # Nothing to impute; predicting on zero rows would raise.
                continue
            rf_df = pd.concat([df2[col], filled_df], axis=1)
            known = rf_df[rf_df[col].notnull()]
            unknown = rf_df[rf_df[col].isnull()]
            x_train = known.drop([col], axis=1)
            y_train = known[col]
            x_pre = unknown.drop([col], axis=1)
            rf = RandomForestRegressor(random_state=0)
            rf.fit(x_train, y_train)
            y_pre = rf.predict(x_pre)
            df2.loc[df2[col].isnull(), col] = y_pre
    return df2
# 常变量/同值化处理
def const_delete(df, col_list, threshold=None):
    """
    Drop constant or near-constant variables.

    df: data set
    col_list: list of column names to check
    threshold: a variable is dropped when its most frequent value accounts for
               at least this share of its non-missing values
    return: a copy of df without those variables

    The number of dropped variables is printed.
    """
    cleaned = df.copy()
    const_col = []
    for column in col_list:
        top_count = cleaned[column].value_counts().iloc[0]
        non_missing = cleaned[cleaned[column].notnull()].shape[0]
        if top_count / non_missing >= threshold:
            const_col.append(column)
    cleaned = cleaned.drop(const_col, axis=1)
    print('常变量/同值化处理的变量个数为{}'.format(len(const_col)))
    return cleaned
# 分类型变量的降基处理
def descending_cate(df, col_list, threshold=None):
    """
    Reduce the cardinality of categorical variables by pooling rare values.

    df: data set
    col_list: list of categorical column names
    threshold: values whose share of the non-missing samples is at or below
               this value are replaced by the literal 'other'
    return: a processed copy of df
    """
    pooled = df.copy()
    for column in col_list:
        non_missing = df[df[column].notnull()].shape[0]
        share = df[column].value_counts() / non_missing
        rare_values = [name for name, pct in zip(share.index, share.values)
                       if pct <= threshold]
        pooled.loc[pooled[column].isin(rare_values), column] = 'other'
    return pooled
# 模型评估
# AUC
def plot_roc(y_label, y_pred):
    """
    Draw the ROC curve with the AUC in the legend.

    y_label: true labels of the test set
    y_pred: predicted probabilities for the test set
    return: the result of plt.show()
    """
    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = metrics.roc_curve(y_label, y_pred)
    auc_value = metrics.roc_auc_score(y_label, y_pred)
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(fpr, tpr, color='blue', label='AUC=%.3f' % auc_value)
    ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line of a random model
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title('ROC')
    ax.legend(loc='best')
    # plt.show() takes no Axes argument; the current figure is shown.
    return plt.show()
# KS
def plot_model_ks(y_label, y_pred):
    """
    Draw the KS curve of the model's predicted probabilities.

    y_label: true labels of the test set (summed to count bad samples)
    y_pred: predicted probabilities for the test set
    return: the result of plt.show()

    The prediction range is split into 200 equal steps; at every step the
    cumulative shares of good and bad samples strictly below the step are
    compared, and the largest absolute difference is shown as the KS value.
    """
    pred_list = list(y_pred)
    label_list = list(y_label)
    total_bad = sum(label_list)
    total_good = len(label_list) - total_bad
    items = sorted(zip(pred_list, label_list), key=lambda pair: pair[0])
    lowest = min(pred_list)  # hoisted: constant across all 200 steps
    step = (max(pred_list) - lowest) / 200
    pred_bin = []
    good_rate = []
    bad_rate = []
    ks_list = []
    for i in range(1, 201):
        idx = lowest + i * step
        pred_bin.append(idx)
        label_bin = [label for pred, label in items if pred < idx]
        bad_num = sum(label_bin)
        good_num = len(label_bin) - bad_num
        goodrate = good_num / total_good
        badrate = bad_num / total_bad
        good_rate.append(goodrate)
        bad_rate.append(badrate)
        ks_list.append(abs(goodrate - badrate))
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(pred_bin, good_rate, color='green', label='good_rate')
    ax.plot(pred_bin, bad_rate, color='red', label='bad_rate')
    ax.plot(pred_bin, ks_list, color='blue', label='good-bad')
    ax.set_title('KS:{:.3f}'.format(max(ks_list)))
    ax.legend(loc='best')
    # plt.show() takes no Axes argument; the current figure is shown.
    return plt.show()
# 交叉验证
def cross_verify(x, y, estimators, fold, scoring='roc_auc'):
    """
    Cross-validate a model, print the score range and draw a box plot of the scores.

    x: feature data set
    y: target data set
    estimators: the model to validate
    fold: cross-validation strategy, forwarded as cv
    scoring: evaluation metric, 'roc_auc' by default
    return: the result of plt.show()
    """
    scores = cross_val_score(estimator=estimators, X=x, y=y, cv=fold, n_jobs=-1, scoring=scoring)
    highest, lowest, average = scores.max(), scores.min(), scores.mean()
    print('CV的最大AUC为:{}'.format(highest))
    print('CV的最小AUC为:{}'.format(lowest))
    print('CV的平均AUC为:{}'.format(average))
    box_style = dict(
        boxprops={'color': 'black', 'facecolor': 'yellow'},
        meanprops={'marker': 'D', 'markerfacecolor': 'tomato'},
        flierprops={'marker': 'o', 'markerfacecolor': 'red', 'color': 'black'},
        medianprops={'linestyle': '--', 'color': 'orange'},
    )
    plt.figure(figsize=(6, 4))
    plt.title('交叉验证的评价指标分布图')
    plt.boxplot(scores, patch_artist=True, showmeans=True, **box_style)
    return plt.show()
# 学习曲线
def plot_learning_curve(estimator, x, y, cv=None, train_size=np.linspace(0.1, 1.0, 5), plt_size=None):
    """
    Draw the learning curve of a model.

    estimator: base model used for the learning curve
    x: feature data set
    y: target data set
    cv: cross-validation strategy
    train_size: training-set sizes to evaluate, forwarded as train_sizes
    plt_size: figure size, forwarded to plt.figure(figsize=...)
    return: the result of plt.show()

    Training and cross-validation mean scores are drawn as lines, each with a
    shaded band of one standard deviation.
    """
    from sklearn.model_selection import learning_curve
    sizes, fit_scores, cv_scores = learning_curve(estimator=estimator,
                                                  X=x,
                                                  y=y,
                                                  cv=cv,
                                                  n_jobs=-1,
                                                  train_sizes=train_size)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    plt.figure(figsize=plt_size)
    plt.xlabel('Training-example')
    plt.ylabel('score')
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std, alpha=0.1, color='r')
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std, alpha=0.1, color='g')
    plt.plot(sizes, fit_mean, 'o-', color='r', label='Training-score')
    plt.plot(sizes, cv_mean, 'o-', color='g', label='cross-val-score')
    plt.legend(loc='best')
    return plt.show()
# 混淆矩阵 /分类报告
def plot_matrix_report(y_label, y_pred):
    """
    Draw the confusion matrix and print the classification report.

    y_label: true labels of the test set
    y_pred: predicted class labels for the test set
            (NOTE(review): confusion_matrix needs class labels, not
            probabilities -- confirm callers threshold their scores first)
    return: the result of plt.show()
    """
    matrix_array = metrics.confusion_matrix(y_label, y_pred)
    plt.matshow(matrix_array, cmap=plt.cm.summer_r)
    plt.colorbar()
    n_rows, n_cols = matrix_array.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # matshow draws element [row, col] at x=col, y=row, so the text
            # must be placed at (col, row) to sit on its own coloured cell.
            plt.annotate(str(matrix_array[row, col]), xy=(col, row), ha='center', va='center')
    # confusion_matrix puts true labels on rows (y axis) and predictions on
    # columns (x axis).
    plt.xlabel('Predict label')
    plt.ylabel('True label')
    print(metrics.classification_report(y_label, y_pred))
    return plt.show()
# 评分卡实现
# 评分卡刻度
def cal_scale(score, odds, PDO, model):
    """
    Compute the scorecard scaling constants.

    score: the score assigned at the given odds
    odds: the reference bad-to-good odds
    PDO: points to double the odds
    model: fitted logistic-regression model (its intercept_[0] is read)
    return: A, B, base_score where
            B = PDO / (ln(odds) - ln(2 * odds)),
            A = score - B * ln(odds),
            base_score = A + B * intercept

    The three values are printed with two decimals.
    """
    # PDO drives the slope; it was previously ignored in favour of a fixed 20.
    B = PDO / (np.log(odds) - np.log(2 * odds))
    A = score - B * np.log(odds)
    base_score = A + B * model.intercept_[0]
    print('B: {:.2f}'.format(B))
    print('A: {:.2f}'.format(A))
    print('基础分为:{:.2f}'.format(base_score))
    return A, B, base_score
# 变量得分表
def score_df_concat(woe_df, model, B):
    """
    Add the score of every bin to the WOE result table.

    woe_df: WOE result table with columns 'col' and 'woe'
    model: fitted logistic-regression model; model.coef_[0] holds one
           coefficient per variable, in the order the variables first appear
           in woe_df['col']
    B: scaling constant of the scorecard
    return: a copy of woe_df with an extra 'score' column equal to
            round(coefficient * woe * B)

    woe_df itself is not modified.
    """
    coe = list(model.coef_[0])
    columns = list(woe_df.col.unique())
    # Look the coefficient up by variable name so the result does not depend
    # on the rows of one variable being contiguous.
    coef_by_col = dict(zip(columns, coe))
    score_df = woe_df.copy()
    score_df['score'] = (score_df['col'].map(coef_by_col) * score_df['woe'] * B).round(0)
    return score_df
# 分数转换
def score_transform(df, target, df_score):
    """
    Replace every feature value by the score of the bin it falls into.

    df: data set
    target: name of the target column (left untouched)
    df_score: score result table with columns 'col', 'min_bin', 'max_bin', 'score'
    return: a copy of df with every non-target column converted to scores

    A value is matched to a bin when min_bin <= value <= max_bin (or
    value == min_bin for single-valued bins). Values that match no bin
    keep the initial score of 0.
    """
    df2 = df.copy()
    for col in df2.drop([target], axis=1).columns:
        x = df2[col]
        bin_map = df_score[df_score.col == col]
        bin_res = np.zeros(x.shape[0], dtype=float)
        for lower, upper, score in zip(bin_map['min_bin'], bin_map['max_bin'], bin_map['score']):
            if lower == upper:
                in_bin = x == lower
            else:
                in_bin = (x >= lower) & (x <= upper)
            # Use the boolean mask positionally so the result does not depend
            # on the index labels of df (e.g. after a train/test split).
            bin_res[np.asarray(in_bin, dtype=bool)] = score
        df2[col] = pd.Series(bin_res, index=x.index, name=x.name)
    return df2
# 得分的KS
def plot_score_ks(df, score_col, target):
    """
    Plot the cumulative good/bad rates against the score and report the KS.

    df: data set
    score_col: name of the final score column
    target: name of the target column (summed to count bad rows)
    return: the result of plt.show()

    The score range is cut into 200 equal steps. For each threshold the rates
    are the shares of all good / all bad rows whose score is strictly below
    the threshold. KS is the largest absolute gap between the two curves.
    """
    total_bad = df[target].sum()
    total_good = df[target].count() - total_bad

    scores = np.asarray(df[score_col])
    labels = np.asarray(df[target])
    order = np.argsort(scores, kind='stable')
    sorted_scores = scores[order]
    # cum_bad[k] = number of bad rows among the k lowest scores
    cum_bad = np.concatenate(([0], np.cumsum(labels[order])))

    lowest = scores.min()
    step = (scores.max() - lowest) / 200
    score_bin = lowest + np.arange(1, 201) * step
    # side='left' counts the rows with score < threshold; one sort replaces a
    # full scan of the data for each of the 200 thresholds.
    n_below = np.searchsorted(sorted_scores, score_bin, side='left')
    bad_num = cum_bad[n_below]
    good_num = n_below - bad_num
    good_rate = good_num / total_good
    bad_rate = bad_num / total_bad
    ks_list = np.abs(good_rate - bad_rate)

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(score_bin, good_rate, color='green', label='good_rate')
    ax.plot(score_bin, bad_rate, color='red', label='bad_rate')
    ax.plot(score_bin, ks_list, color='blue', label='good-bad')
    ax.set_title('KS:{:.3f}'.format(ks_list.max()))
    ax.legend(loc='best')
    # plt.show() takes no Axes argument; the Axes used to be passed positionally.
    return plt.show()
# PR曲线
def plot_PR(df, score_col, target, plt_size=None):
    """
    Plot the precision-recall curve of the score.

    df: data set containing the score
    score_col: name of the score column
    target: name of the target column (summed to count bad rows)
    plt_size: figure size
    return: the result of plt.show()

    Every distinct score is used as a threshold. Rows with score <= threshold
    form the flagged group; precision is the bad share inside that group and
    recall is the share of all bad rows captured by it.
    """
    total_bad = df[target].sum()
    scores = np.asarray(df[score_col])
    labels = np.asarray(df[target])

    order = np.argsort(scores, kind='stable')
    cum_bad = np.cumsum(labels[order])
    # np.unique returns the distinct scores in ascending order with their
    # multiplicities, so the running total of counts is the number of rows
    # with score <= each distinct value. This replaces one full scan of the
    # data per distinct score.
    _, counts = np.unique(scores, return_counts=True)
    total_num = np.cumsum(counts)
    bad_num = cum_bad[total_num - 1]

    precision_list = bad_num / total_num
    tpr_list = bad_num / total_bad

    plt.figure(figsize=plt_size)
    plt.title('PR曲线')
    plt.xlabel('查全率')
    plt.ylabel('精确率')
    plt.plot(tpr_list, precision_list, color='tomato', label='PR曲线')
    plt.legend(loc='best')
    return plt.show()
# 得分分布图
def plot_score_hist(df, target, score_col, plt_size=None, cutoff=None):
    """
    Plot the score density of bad (target == 1) and good (target == 0) rows.

    df: data set
    target: name of the target column
    score_col: name of the final score column
    plt_size: figure size
    cutoff: optional accept/reject score; drawn as a vertical line when given
    return: the result of plt.show()
    """
    plt.figure(figsize=plt_size)
    bad_scores = df.loc[df[target] == 1, score_col]
    good_scores = df.loc[df[target] == 0, score_col]
    sns.kdeplot(bad_scores, shade=True, label='坏用户', color='hotpink')
    sns.kdeplot(good_scores, shade=True, label='好用户', color='seagreen')
    # cutoff defaults to None; only draw the line when a value was supplied
    # instead of handing None to axvline.
    if cutoff is not None:
        plt.axvline(x=cutoff)
    plt.legend()
    return plt.show()
# 得分明细表
def score_info(df, score_col, target, x=None, y=None, step=None):
    """
    Build the per-score-band detail table.

    df: data set (not modified)
    score_col: name of the final score column
    target: name of the target column (summed to count bad rows)
    x: left edge of the lowest band
    y: stop value for the band edges
    step: width of each band
    return: one row per band with counts, bad rate and cumulative shares; the
        band itself is in the 'score_bin' column

    The band edges are np.arange(x, y, step), so y itself is excluded: the last
    edge is the largest x + k * step below y. Bands are right-closed. Rows
    whose score lies outside the edges belong to no band but still count
    towards the overall totals used as denominators of the cumulative shares.
    """
    # Keep the binning in a local Series rather than writing a helper column
    # into the caller's frame.
    score_bin = pd.cut(df[score_col], bins=np.arange(x, y, step), right=True).rename('score_bin')
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    # observed=False keeps empty bands in the table regardless of the pandas
    # default for categorical groupers.
    group = df[target].groupby(score_bin, observed=False)
    score_info_df = pd.DataFrame({'用户数': group.count(), '坏用户': group.sum()})
    score_info_df['好用户'] = score_info_df['用户数'] - score_info_df['坏用户']
    score_info_df['违约占比'] = score_info_df['坏用户'] / score_info_df['用户数']
    score_info_df['累计用户'] = score_info_df['用户数'].cumsum()
    score_info_df['坏用户累计'] = score_info_df['坏用户'].cumsum()
    score_info_df['好用户累计'] = score_info_df['好用户'].cumsum()
    score_info_df['坏用户累计占比'] = score_info_df['坏用户累计'] / bad
    score_info_df['好用户累计占比'] = score_info_df['好用户累计'] / good
    score_info_df['累计用户占比'] = score_info_df['累计用户'] / total
    score_info_df['累计违约占比'] = score_info_df['坏用户累计'] / score_info_df['累计用户']
    return score_info_df.reset_index()
# 绘制提升图和洛伦兹曲线
def plot_lifting(df, score_col, target, bins=10, plt_size=None):
    """
    Plot the lift chart and the Lorenz curve of the score.

    df: data set containing the final score
    score_col: name of the final score column
    target: name of the target column (summed to count bad rows)
    bins: number of equal-sized groups the score-sorted rows are split into
    plt_size: figure size
    return: the result of plt.show()
    raises: ValueError if df has fewer rows than bins

    Rows are sorted by score and split into `bins` consecutive groups. Each
    group's share of all bad rows is compared with the 1 / bins share a random
    ordering would give.
    """
    score_list = list(df[score_col])
    label_list = list(df[target])
    items = sorted(zip(score_list, label_list), key=lambda pair: pair[0])
    n_rows = len(items)
    if n_rows < bins:
        raise ValueError('need at least {} rows to build {} bins, got {}'.format(bins, bins, n_rows))

    bad = df[target].sum()
    all_badrate_list = [float(1 / bins)] * bins
    all_badrate_cum = list(np.cumsum(all_badrate_list))
    all_badrate_cum.insert(0, 0)

    # Integer edges i * n // bins cover every row exactly once. A rounded
    # common bin width either drops the highest-scoring rows or runs past the
    # end of the data when n is not a multiple of bins.
    edges = [i * n_rows // bins for i in range(bins + 1)]
    score_bin_list = []
    bad_rate_list = []
    for index_a, index_b in zip(edges[:-1], edges[1:]):
        chunk = items[index_a:index_b]
        chunk_scores = [pair[0] for pair in chunk]
        score_bin_list.append((min(chunk_scores), max(chunk_scores)))
        bad_rate_list.append(sum(pair[1] for pair in chunk) / bad)
    bad_rate_cumsum = list(np.cumsum(bad_rate_list))
    bad_rate_cumsum.insert(0, 0)

    plt.figure(figsize=plt_size)
    bar_width = 0.3
    positions = np.arange(bins)

    plt.subplot(1, 2, 1)
    plt.title('提升图')
    plt.xticks(positions + 0.15, score_bin_list, rotation=90)
    plt.bar(positions, bad_rate_list, width=bar_width, color='hotpink', label='score_card')
    plt.bar(positions + bar_width, all_badrate_list, width=bar_width, color='seagreen', label='random')
    plt.legend(loc='best')

    plt.subplot(1, 2, 2)
    plt.title('洛伦兹曲线图')
    plt.plot(bad_rate_cumsum, color='hotpink', label='score_card')
    plt.plot(all_badrate_cum, color='seagreen', label='random')
    plt.xticks(np.arange(bins + 1), rotation=0)
    plt.legend(loc='best')
    return plt.show()
# 设定cutoff点,衡量有效性
def rule_verify(df, col_score, target, cutoff):
    """
    Evaluate a score cutoff used as a rejection rule.

    df: data set (not modified)
    col_score: name of the final score column
    target: name of the target column (1 = bad, 0 = good)
    cutoff: rows with score <= cutoff are refused (code 30), all others are
        passed (code 10)
    return: pivot table of row counts, indexed by result code (30 / 10) with
        one column per target value

    Precision, recall, the share of good rows refused and the overall refusal
    rate are printed. A ratio whose denominator is zero is reported as nan
    instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        return round(numerator / denominator, 3) if denominator else float('nan')

    # Vectorised labelling on a copy: no row-wise apply and no helper column
    # left behind in the caller's frame. NaN scores compare False and are
    # therefore passed.
    work = df.assign(result=np.where(df[col_score] <= cutoff, 30, 10))
    refused = work['result'] == 30
    is_bad = work[target] == 1
    is_good = work[target] == 0

    refused_bad = int((refused & is_bad).sum())
    refused_good = int((refused & is_good).sum())
    bad = int(is_bad.sum())
    good = int(is_good.sum())
    refuse = int(refused.sum())

    precision = _ratio(refused_bad, refuse)
    recall = _ratio(refused_bad, bad)
    good_refused_rate = _ratio(refused_good, good)
    refuse_rate = _ratio(refuse, work.shape[0])

    matrix_df = pd.pivot_table(work, index='result', columns=target, aggfunc='count',
                               values=col_score)
    print('精确率:{}'.format(precision))
    print('查全率:{}'.format(recall))
    print('误伤率:{}'.format(good_refused_rate))
    print('规则拒绝率:{}'.format(refuse_rate))
    return matrix_df
# 绘制变量的得分占比偏移图
def plot_var_shift(df, day_col, score_col, plt_size=None):
    """
    Plot how the share of each score value of one variable shifts over time.

    df: one row per observation, holding the day and the variable's score
    day_col: name of the day column
    score_col: name of the score column
    plt_size: figure size
    return: None; the chart is displayed with plt.show()

    For every day the shares of the distinct scores are stacked into a 100%
    column. The translucent areas drawn behind the bars repeat the same
    heights across each bar's width.
    """
    day_list = sorted(set(df[day_col]))
    score_list = sorted(set(df[score_col]))

    # Share of each score within each day: one row per day, one column per score.
    prop_day_list = []
    for day in day_list:
        day_scores = df.loc[df[day_col] == day, score_col]
        day_total = day_scores.shape[0]
        prop_day_list.append([(day_scores == score).sum() / day_total for score in score_list])
    props = np.array(prop_day_list, dtype=float).reshape(len(day_list), len(score_list))

    # Band k is drawn as a bar of height 1 - (share of all lower scores).
    # Drawing the bands in ascending score order lets each one cover the lower
    # part of the previous one, which yields the stacked look.
    cum = np.cumsum(props, axis=1)
    lower_share = np.hstack([np.zeros((len(day_list), 1)), cum[:, :-1]])
    heights = 1 - lower_share

    # x positions: bars sit on odd integers, areas span each bar's full width.
    x_bar = list(range(1, len(day_list) * 2, 2))
    x_stack = []
    for centre in x_bar:
        x_stack.extend((centre - 0.5, centre + 0.5))

    fig = plt.figure(figsize=plt_size)
    ax1 = fig.add_subplot(1, 1, 1)
    # Blank out the automatic x tick labels; the day labels are written by hand below.
    ax1.xaxis.set_major_formatter(plt.FuncFormatter(''.format))
    ax1.set_xticks(range(1, len(day_list) * 2, 2))

    # Show the y axis as percentages.
    def to_percent(temp, position):
        return '%1.0f' % (100 * temp) + '%'

    ax1.yaxis.set_major_formatter(plt.FuncFormatter(to_percent))

    for x_pos, day in zip(x_bar, day_list):
        ax1.text(x_pos, -0.08, day, ha='center', va='bottom')

    # Iterate over the scores: pairing the loop with the number of days
    # dropped bands whenever there were fewer days than distinct scores.
    for i, s in enumerate(score_list):
        band = heights[:, i]
        ax1.stackplot(x_stack, np.repeat(band, 2), alpha=0.25)
        ax1.bar(x_bar, band, width=1, label='得分:{}'.format(s))

    ax1.grid(True, 'major', 'y', ls='--', lw=.5, c='black', alpha=.3)
    ax1.legend(loc='best')
    plt.show()
# 计算评分的PSI
def _score_bin_share(df, id_col, score_col, edges, count_name, pct_name):
    """Count non-null ids per score bin and compute each bin's share of the total."""
    score_bin = pd.cut(df[score_col], bins=edges).rename('score_bin')
    # observed=False keeps empty bins so both samples yield the same set of rows.
    counts = df[id_col].groupby(score_bin, observed=False).count()
    share = counts.rename(count_name).reset_index()
    share[pct_name] = share[count_name] / share[count_name].sum()
    return share


def score_psi(df1, df2, id_col, score_col, x, y, step=None):
    """
    Compute the population stability index (PSI) of the score.

    df1: modelling sample with user id and score (not modified)
    df2: online sample with user id and score (not modified)
    id_col: name of the user id column
    score_col: name of the score column
    x: left edge of the lowest score bin
    y: stop value for the bin edges
    step: bin width
    return: one row per score bin with both samples' counts and shares, the
        per-bin contribution in 'Index' and the total repeated in 'PSI'

    The bin edges are np.arange(x, y, step), so y itself is excluded.
    NOTE: a bin that is empty in exactly one sample contributes an infinite
    'Index' (log of 0 or division by 0); no smoothing is applied.
    """
    edges = np.arange(x, y, step)
    model_score_group = _score_bin_share(df1, id_col, score_col, edges, '建模样本户数', '建模户数占比')
    online_score_group = _score_bin_share(df2, id_col, score_col, edges, '线上样本户数', '线上户数占比')

    score_compare = pd.merge(model_score_group, online_score_group, on='score_bin', how='inner')
    score_compare['占比差异'] = score_compare['线上户数占比'] - score_compare['建模户数占比']
    score_compare['占比权重'] = np.log(score_compare['线上户数占比'] / score_compare['建模户数占比'])
    score_compare['Index'] = score_compare['占比差异'] * score_compare['占比权重']
    score_compare['PSI'] = score_compare['Index'].sum()
    return score_compare
# 评分比较分布图
def plot_score_compare(df, plt_size=None):
    """
    Draw side-by-side bars comparing the score distribution of the modelling
    sample with that of the online sample.

    df: table with a 'score_bin' column and the two share columns
        '建模户数占比' and '线上户数占比'
    plt_size: figure size
    return: the result of plt.show()
    """
    plt.figure(figsize=plt_size)
    bin_labels = df['score_bin']
    model_share = df['建模户数占比']
    online_share = df['线上户数占比']
    bar_width = 0.3

    # The second series is shifted by one bar width so the bars sit next to
    # each other; the tick is placed between the two.
    model_pos = np.arange(len(model_share))
    online_pos = np.arange(len(online_share)) + bar_width
    tick_pos = np.arange(len(bin_labels)) + 0.15

    plt.title('评分分布对比图')
    plt.xlabel('得分区间')
    plt.ylabel('用户占比')
    plt.xticks(tick_pos, bin_labels)
    plt.bar(model_pos, model_share, width=bar_width, color='seagreen', label='建模样本')
    plt.bar(online_pos, online_share, width=bar_width, color='hotpink', label='上线样本')
    plt.legend()
    return plt.show()
# 变量稳定度分析
def var_stable(score_result, df, var, id_col, score_col, bins):
    """
    Compare the bin distribution of one variable between the modelling sample
    and the online sample.

    score_result: scorecard detail table; the 'col', 'bin', 'total',
        'totalrate' and 'score' columns are read
    df: online sample (not modified)
    var: name of the variable to analyse (matched against score_result['col'])
    id_col: name of the user id column in df
    score_col: name of the df column that is cut into `bins`
        NOTE(review): the original docs call this the score column while
        `bins` are described as the variable's intervals -- confirm which
        column callers are expected to pass.
    bins: bin edges handed to pd.cut
    return: one row per bin present in both tables, with the bin score, both
        samples' counts and shares, the absolute score-weighted shares and
        their difference (online minus modelling)

    The two tables are inner-joined on 'bin', so the 'bin' values in
    score_result must compare equal to the intervals produced by pd.cut.
    """
    model_var_group = (
        score_result.loc[score_result['col'] == var, ['bin', 'total', 'totalrate', 'score']]
        .reset_index(drop=True)
        .rename(columns={'total': '建模用户数',
                         'totalrate': '建模用户占比',
                         'score': '得分'})
    )

    # Bin the online sample in a local Series rather than adding a 'bin'
    # column to the caller's frame. observed=False keeps empty bins.
    online_bin = pd.cut(df[score_col], bins=bins).rename('bin')
    online_counts = df[id_col].groupby(online_bin, observed=False).count()
    online_var_group = online_counts.rename('线上用户数').reset_index()
    online_var_group['线上用户占比'] = online_var_group['线上用户数'] / online_var_group['线上用户数'].sum()

    merged = pd.merge(model_var_group, online_var_group, on='bin', how='inner')
    # Select by name and copy, so the assignments below work on an
    # independent frame rather than on a positional slice.
    var_stable_df = merged[['bin', '得分', '建模用户数', '建模用户占比', '线上用户数', '线上用户占比']].copy()
    var_stable_df['得分'] = var_stable_df['得分'].astype('int64')
    var_stable_df['建模样本权重'] = np.abs(var_stable_df['得分'] * var_stable_df['建模用户占比'])
    var_stable_df['线上样本权重'] = np.abs(var_stable_df['得分'] * var_stable_df['线上用户占比'])
    var_stable_df['权重差距'] = var_stable_df['线上样本权重'] - var_stable_df['建模样本权重']
    return var_stable_df