5 连续型变量自动分箱
在评分卡建模过程中,连续型变量需要先分箱(离散化),再按箱计算 WOE 值。
这里使用卡方分箱(ChiMerge):先把每个不同取值作为一个初始区间,再反复合并卡方值最小的相邻区间,直到区间数不超过 k。
# Chi-square binning (ChiMerge)
def _chi_square(table):
    """Return the chi-square statistic of a contingency table.

    ``table`` is a list of rows, each row holding the per-label counts of
    one interval. Cells whose expected frequency is zero are skipped.
    """
    row_totals = [sum(row) for row in table]
    col_totals = [sum(col) for col in zip(*table)]
    total = sum(col_totals)
    chi = 0
    for row, row_total in zip(table, row_totals):
        for observed, col_total in zip(row, col_totals):
            expected = row_total * col_total / total
            if expected != 0:
                chi += (observed - expected) ** 2 / expected
    return chi


def Chi_merge(X, y, columns, k=6):
    """Bin continuous columns by repeatedly merging adjacent intervals.

    Every distinct value of a column starts as its own interval. While more
    than ``k`` intervals remain, the chi-square statistic of each pair of
    adjacent intervals is computed and every pair that attains the minimum
    is merged in the same pass, so the final number of intervals can be
    smaller than ``k``.

    Args:
        X: DataFrame holding the feature columns.
        y: Series of labels aligned with ``X``; the labels must be the
            integers ``0 .. n_labels - 1``.
        columns: names of the columns of ``X`` to bin.
        k: maximum number of intervals to keep (values below 1 are
            treated as 1).

    Returns:
        dict mapping each column name to its list of cut points. The list
        starts with ``-inf``, ends with ``+inf`` and in between holds the
        first value of every interval except the lowest one.

    Raises:
        TypeError: if a label is not one of ``0 .. n_labels - 1``.
    """
    item = dict()
    pinf = float('inf')   # positive infinity
    ninf = float('-inf')  # negative infinity
    max_intervals = max(k, 1)  # at least one interval always remains

    for col in columns:
        # 1. Count the samples per (feature value, label). Fixed column
        #    names keep the grouping independent of the input names.
        frame = pd.concat([X[col], y], axis=1)
        frame.columns = ['feature', 'label']
        pair_counts = frame.groupby(['feature', 'label']).size()
        n_labels = pair_counts.index.get_level_values('label').nunique()

        # starts[i] is the first value of interval i and counts[i] its
        # per-label frequencies; groupby yields the keys in sorted order.
        starts = []
        counts = []
        for (feature, label), cnt in pair_counts.items():
            if label not in range(n_labels):
                raise TypeError("Data Excption")
            if not starts or starts[-1] != feature:
                starts.append(feature)
                counts.append([0] * n_labels)
            counts[-1][int(label)] = int(cnt)

        # 2. Merge the adjacent intervals with the smallest chi-square.
        while len(starts) > max_intervals:
            chi_values = [
                _chi_square([counts[i], counts[i + 1]])
                for i in range(len(counts) - 1)
            ]
            min_chi = min(chi_values)
            merged_starts = [starts[0]]
            merged_counts = [counts[0]]
            for start, cnt, chi in zip(starts[1:], counts[1:], chi_values):
                if chi == min_chi:
                    # absorb this interval into the one on its left
                    merged_counts[-1] = [
                        a + b for a, b in zip(merged_counts[-1], cnt)
                    ]
                else:
                    merged_starts.append(start)
                    merged_counts.append(cnt)
            starts, counts = merged_starts, merged_counts

        # The lowest interval start is replaced by -inf and +inf closes
        # the list.
        item[col] = [ninf] + [float(s) for s in starts[1:]] + [pinf]
    return item
分箱:
# Automatic binning of the continuous variables
X, y = split_column(df, "Y")
result = Chi_merge(X, y, col_values, k=8)
print("======================连续变量分箱==========================")
# Show every (column, cut points) pair first ...
for name, cuts in result.items():
    print((name, cuts))
# ... then register the cut points of each column in col_item.
for name, cuts in result.items():
    col_item[name] = {"cut": cuts, "type": "cut"}
分箱结果:
======================连续变量分箱==========================
('年龄', [-inf, 21.0, 24.0, 28.0, 30.0, 34.0, 42.0, 60.0, inf])
('投资损失', [-inf, 1573.0, 1825.0, 1887.0, 1944.0, 1977.0, 1980.0, 2392.0, inf])
('教育时间', [-inf, 3.0, 9.0, 10.0, 11.0, 13.0, 14.0, 15.0, inf])
('工作天数', [-inf, 3.0, 35.0, 40.0, 42.0, 50.0, 51.0, 62.0, inf])
('投资收入', [-inf, 114.0, 3103.0, 3137.0, 4386.0, 4416.0, 5178.0, 99999.0, inf])
6 计算woe值
由于连续变量按区间分段、离散变量按类别取值分段,两者的分段方式不一致,这里分开计算 WOE 值。
# Compute the WOE value of every bin / category.
def _woe(bad_count, good_count, total_bad, total_good):
    """Return ln((bad_count / total_bad) / (good_count / total_good)).

    Returns 0 when either count is zero, so the logarithm is never taken
    of zero and no division by zero occurs.
    """
    if (bad_count == 0) or (good_count == 0):
        return 0
    return math.log((bad_count / total_bad) / (good_count / total_good))


# Rows with Y == 0 are counted as "bad", rows with Y == 1 as "good".
is_bad = df["Y"] == 0
is_good = df["Y"] == 1

# WOE of the continuous variables: one value per (lower, upper] bin
for col, info in col_item.items():
    bin_list = info["cut"]
    values = df[col]
    woes = []
    for lower, upper in zip(bin_list, bin_list[1:]):
        in_bin = (values > lower) & (values <= upper)
        bin_count = int(in_bin.sum())
        bad_count = int((in_bin & is_bad).sum())
        # everything in the bin that is not "bad" is counted as "good"
        good_count = bin_count - bad_count
        woes.append(_woe(bad_count, good_count, total_bad, total_good))
    info["woe"] = woes

# WOE of the discrete variables: one value per category
for col in col_labels:
    # categories in value_counts() order (most frequent first)
    categories = list(df[col].value_counts().index)
    woes = []
    for category in categories:
        in_category = df[col] == category
        bad_count = int((in_category & is_bad).sum())
        good_count = int((in_category & is_good).sum())
        woes.append(_woe(bad_count, good_count, total_bad, total_good))
    col_item[col] = {"cut": categories, "type": "type", "woe": woes}
7 原始数据替换为woe值
将原始数据替换为woe值然后进行逻辑回归训练。
# Replace the raw values of every column with the WOE of their bin/category.
# All masks are evaluated on the untouched column and the result is written
# back once; overwriting the column bin by bin would let a WOE value that
# was already written match a later bin (or category) and be replaced again.
for col, info in col_item.items():
    bin_list, woex = info["cut"], info["woe"]
    original = df[col]
    if info["type"] == "cut":
        # continuous variable: bins are (lower, upper]
        replaced = original.astype(float)
        for lower, upper, woe in zip(bin_list, bin_list[1:], woex):
            replaced.loc[(original > lower) & (original <= upper)] = woe
    else:
        # discrete variable: one WOE value per category
        replaced = original.astype(object)
        for category, woe in zip(bin_list, woex):
            replaced.loc[original == category] = woe
    # values that match no bin/category are kept as they were
    df[col] = replaced
8 逻辑回归
使用转换后的 WOE 数据训练高收入预测二分类模型。训练时额外增加一列恒为 1 的偏置项 bia,其系数用于后面计算评分卡的基础分。
# Fit the logistic regression and collect one coefficient per column
X, y = split_column(df, y="Y")
drop_list = []
X = X.drop(drop_list, axis=1)
X['bia'] = 1  # constant column; its coefficient feeds the base score
logic = LogisticRegression(penalty="l2", class_weight=None, C=1)
lg = logic.fit(X, y)
coef_list = lg.coef_[0]
coef_dict = dict(zip(X.columns.values, coef_list))
# LogisticRegression fits its own intercept by default, so the constant
# term of the model is split between the 'bia' coefficient and
# lg.intercept_. Fold the intercept in so that coef_dict["bia"] is the
# full constant term.
coef_dict["bia"] = coef_dict["bia"] + lg.intercept_[0]
print("======================coef 系数==========================")
print(coef_dict)
逻辑回归系数
======================coef 系数==========================
('工作情况', -0.08107345111649898)
('性别', -0.12778166055899795)
('bia', 3.3405391750781788)
('家庭角色', -0.09961242632590088)
('教育', -0.3154451804440804)
('教育时间', -0.5790788213669904)
('婚姻状况', -0.3732197877225377)
('年龄', -0.1772763593749565)
('投资收入', -2.5629206586528372)
('投资损失', -1.3755660231932954)
('职业类型', -0.25562556502783673)
('工作天数', -0.3618937334098486)
9 建立评分卡
这里就不重复介绍评分卡的公式和原理了,我们先计算出公式的系数,然后计算不同分段的得分权值,最后加总。
直接上代码:
# Compute the scores
P = 600             # score assigned at the reference ratio
PD0 = 20            # score difference when the ratio doubles
default_ratio = 10  # non-high-income / high-income
A = P - (PD0 * math.log(default_ratio)) / math.log(2)
B = PD0 / math.log(2)
print("公式参数 A: {} B: {}".format(A, B))
base_score = round(A + B * coef_dict["bia"])
print("基础分值: {}".format(base_score))
print("======================coef 系数==========================")
for c in coef_dict.items():
    print(c)

# One record per model column (the constant 'bia' column is skipped):
# its coefficient plus the bins/categories and their WOE values.
items = [
    {
        "col": col,
        "coef": coef_dict[col],
        "cut": col_item[col]["cut"],
        "woe": col_item[col]["woe"],
    }
    for col in X.columns.values
    if col != "bia"
]

# Scorecard: the points of a bin are round(coef * woe * B)
print("======================评分标准==========================")
for item in items:
    print("====== {}".format(item["col"]))
    cuts = item["cut"]
    is_continuous = item["col"] in col_values
    for index, w in enumerate(item["woe"]):
        score = round(item["coef"] * w * B)
        if is_continuous:
            # continuous variable: show the bin as lower~upper
            print("{}~{} => {}".format(cuts[index], cuts[index + 1], score))
        else:
            # discrete variable: show the category
            print("{} => {}".format(cuts[index], score))
得到的评分标准如下:
======================评分标准==========================
基础分值: 630.0
====== 年龄
-inf~21.0 => -90.0
21.0~24.0 => -45.0
24.0~28.0 => -19.0
28.0~30.0 => -6.0
30.0~34.0 => -0.0
34.0~42.0 => 7.0
42.0~60.0 => 12.0
60.0~inf => -0.0
====== 工作天数
-inf~3.0 => -10.0
3.0~35.0 => -18.0
35.0~40.0 => -2.0
40.0~42.0 => 4.0
42.0~50.0 => 11.0
50.0~51.0 => 3.0
51.0~62.0 => 12.0
62.0~inf => 8.0
====== 职业类型
保安 => 6.0
运输 => -3.0
机械操作 => -12.0
执行主管 => 15.0
管理文书 => -10.0
农业捕捞 => -13.0
其他职业 => -28.0
手工艺维修 => -1.0
销售 => 2.0
劳工保洁 => -21.0
家政服务 => -49.0
军人 => 12.0
技术支持 => 4.0
专业技术 => 14.0
未知 => -15.0
====== 投资收入
-inf~114.0 => -6.0
114.0~3103.0 => -11.0
3103.0~3137.0 => -0.0
3137.0~4386.0 => -7.0
4386.0~4416.0 => -0.0
4416.0~5178.0 => 28.0
5178.0~99999.0 => 114.0
99999.0~inf => -0.0
====== 投资损失
-inf~1573.0 => -2.0
1573.0~1825.0 => -90.0
1825.0~1887.0 => 71.0
1887.0~1944.0 => 103.0
1944.0~1977.0 => 83.0
1977.0~1980.0 => -0.0
1980.0~2392.0 => -3.0
2392.0~inf => 72.0
====== 教育
专科 => 2.0
职高 => 52.0
小学 => -46.0
高中 => -16.0
幼儿园 => -70.0
本科 => 20.0
研究生 => 33.0
初中 => -38.0
本科肄业 => -7.0
博士 => 51.0
====== 家庭角色
未婚 => -21.0
其他关系 => -30.0
离家 => -14.0
妻子 => 14.0
丈夫 => 13.0
孩子 => -41.0
====== 婚姻状况
未婚 => -28.0
已婚配偶异地 => -19.0
已婚 => 14.0
分居 => -22.0
离婚 => -16.0
丧偶 => -18.0
====== 教育时间
-inf~3.0 => 7.0
3.0~9.0 => 3.0
9.0~10.0 => 1.0
10.0~11.0 => -0.0
11.0~13.0 => -3.0
13.0~14.0 => -5.0
14.0~15.0 => -8.0
15.0~inf => -8.0
====== 工作情况
省政府 => 1.0
中央部委 => 0.0
非有限责任公司 => 0.0
有限责任公司 => 3.0
地方政府 => 1.0
个体 => -0.0
未知 => -2.0
====== 性别
男 => 0.0
女 => -1.0
下一步我们编写代码对测试集数据进行评分的计算和评分卡监控。