Author:LieDra
前言
下面将利用Logistic回归(逻辑回归)对数据进行建模,并给出结果。
逻辑回归算法介绍
对一些现有的数据点进行值的拟合过程,就叫做回归。
逻辑回归虽然带有“回归”二字,但一般并不用于回归任务,而更多地用于分类。
需要找出一个预测函数模型,使其值输出在[0,1]之间。然后选择一个基准值,如0.5,如果算出来的预测值大于0.5,就认为其预测值为1,反之则其预测值为0。
这里的预测函数一般就是Logistic函数(也叫Sigmoid函数)。
为衡量样本预测值与真实值之间的误差,逻辑回归中常用的损失函数是对数损失函数(即交叉熵损失)。
逻辑回归模型参数可以通过梯度下降算法来求。
使用Sklearn时,我们并不需要考虑那么多的细节,只需要调整好部分参数即可。
代码
'''
version:
author:LieDra
Method:Logistic Regression--非标准化数据
'''
import matplotlib.pyplot as plt
import numpy as np  # np.log is used by the scorecard code below
import pandas as pd
from sklearn.model_selection import train_test_split #划分数据
from sklearn.linear_model.logistic import LogisticRegression #逻辑回归
# import statsmodels.api as sm
#混淆矩阵计算
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Load the German credit data set: the last column is the label, every other
# column is a feature.
df_german = pd.read_excel("D:/study/5/code/python/python Data analysis and mining/class/dataset/german.xls")
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
y = df_german.iloc[:, -1]
x = df_german.iloc[:, :-1]

# Accuracy on the check subset for each of the 1000 random splits tried below.
l1 = []
for i in range(1000):
    print('****'*50)
    print('第',i+1,'次test')
    # 60% of the rows for training (stratified on the label); the remaining
    # 40% is split again and its 75% share becomes the check subset.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, train_size=0.6, random_state=i+1)
    x_test2, x_check, y_test2, y_check = train_test_split(
        x_test, y_test, train_size=0.25, random_state=i+1)
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    # Score the check subset once and reuse the value for printing and storing.
    check_accuracy = classifier.score(x_check, y_check)
    print("accuracy on the training subset:{:.3f}".format(classifier.score(x_train,y_train)))
    print("accuracy on the check subset:{:.3f}".format(check_accuracy))
    l1.append(check_accuracy)
# 1-based number of the split (== its random_state) with the best check accuracy.
# NOTE(review): `classifier`, `x_test` and `y_test` used further down are the
# ones left over from the final iteration, not from this best split -- confirm
# that this is intended.
print('max index',l1.index(max(l1))+1)
#得分公式
'''
P0 = 50
PDO = 10
theta0 = 1.0/20
B = PDO/np.log(2)
A = P0 + B*np.log(theta0)
'''
def Score(probability, *, offset=None, factor=None):
    """Convert a predicted probability into a scorecard score.

    The score is ``offset - factor * ln(probability / (1 - probability))``,
    using the natural logarithm, so it falls as the probability rises.

    Args:
        probability: Predicted probability; a scalar or a NumPy array. It must
            lie strictly between 0 and 1, otherwise the odds are 0 or undefined.
        offset: Scorecard offset. Defaults to the module-level ``A``.
        factor: Scorecard factor. Defaults to the module-level ``B``.

    Returns:
        The score, with the same shape as ``probability``.
    """
    # Fall back to the module-level calibration constants so that existing
    # one-argument calls keep working unchanged.
    if offset is None:
        offset = A
    if factor is None:
        factor = B
    odds = probability / (1 - probability)
    return offset - factor * np.log(odds)
# Score a whole batch of probabilities in one call.
def List_score(pos_probablity_list):
    """Return the ``Score`` of every probability, in input order, as a list."""
    return [Score(single_probability) for single_probability in pos_probablity_list]
# Scorecard calibration: with Score = A - B*ln(odds), a sample whose odds equal
# theta0 scores P0 points, and the score drops by PDO points whenever the odds
# double.
P0 = 50
PDO = 10
theta0 = 1.0 / 20
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)
print("A:",A)
print("B:",B)

# For g(x) = w1*x1 + ... + wn*xn + w0, coef_ holds w1..wn and intercept_ holds w0.
list_coef = list(classifier.coef_[0])
intercept = classifier.intercept_

# Class probabilities for every row of x, one column per class in class order.
# The second column is kept as the "positive" probability (per the original
# author's note: 0 = good customer, 1 = bad customer -- not verifiable here).
probablity_list = classifier.predict_proba(x)
pos_probablity_list = [class_probs[1] for class_probs in probablity_list]

# Score and hard prediction for every customer, gathered into one table.
list_score = List_score(pos_probablity_list)
list_predict = classifier.predict(x)
result_columns = {
    "label": y,
    "predict": list_predict,
    "pos_probablity": pos_probablity_list,
    "score": list_score,
}
df_result = pd.DataFrame(result_columns)
# df_result.to_excel("score_proba.xlsx")
print(df_result)

# Feature names: every column except the last one, which is the label.
list_vNames = df_german.columns[:-1]
df_coef = pd.DataFrame({"variable_names": list_vNames, "coef": list_coef})
# df_coef.to_excel("coef.xlsx")
# Show the fitted coefficient of each feature.
print(df_coef)

# Evaluate the hard predictions on the held-out x_test / y_test split.
y_true = y_test
y_pred = classifier.predict(x_test)
accuracyScore = accuracy_score(y_true, y_pred)
# precision = TP / (TP + FP)
precision = precision_score(y_true, y_pred)
# recall (sensitivity) = TP / (TP + FN)
sensitivity = recall_score(y_true, y_pred)
# F1 = 2 * precision * recall / (precision + recall); best value 1, worst 0.
f1Score = f1_score(y_true, y_pred)
print('model accuracy is:',accuracyScore)
print('model precision is:',precision)
print('model sensitivity is:',sensitivity)
print("f1_score:",f1Score)
def AUC(y_true, y_scores):
    """Return the area under the ROC curve, reflected so it is never below 0.5.

    The ROC curve is computed with label 1 as the positive class. An area
    smaller than 0.5 is replaced by ``1 - area``, so a score that ranks the
    classes in the opposite direction still gets the area of its mirrored
    curve.

    Args:
        y_true: True labels.
        y_scores: Scores used to rank the samples.

    Returns:
        The (possibly reflected) area under the curve.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_true, y_scores, pos_label=1)
    area = auc(false_pos_rate, true_pos_rate)
    return 1 - area if area < 0.5 else area
def Draw_roc(auc_value):
    """Plot the ROC curve of the module-level ``list_score`` against ``y``.

    The curve is computed with label 0 as the positive class. Nothing is
    shown here; the caller is expected to call ``plt.show()``.

    Args:
        auc_value: Area value printed in the legend entry of the curve.
    """
    roc_points = metrics.roc_curve(y, list_score, pos_label=0)
    false_pos_rate, true_pos_rate = roc_points[0], roc_points[1]
    # Dashed grey diagonal as a visual reference line.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Diagonal line')
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % auc_value)
    plt.title('ROC curve')
    plt.legend(loc="lower right")
# Rate a classifier from its AUC value.
def AUC_performance(AUC):
    """Print a one-line verbal rating for the given AUC value.

    Bands: ``>= 0.7`` good, ``(0.6, 0.7)`` not very good, ``(0.5, 0.6]``
    useless, ``<= 0.5`` bad. A value that matches no band (NaN) prints nothing.
    """
    if AUC >= 0.7:
        verdict = "good classifier"
    elif AUC > 0.6:
        verdict = "not very good classifier"
    elif AUC > 0.5:
        verdict = "useless classifier"
    elif AUC <= 0.5:
        verdict = "bad classifier,with sorting problems"
    else:
        # Only reachable when every comparison is False, e.g. for NaN.
        return
    print(verdict)
# AUC check.
# NOTE(review): the original comment said this uses the test-set data, but `y`
# and `list_score` cover every row of the data set (training rows included).
auc_value=AUC(y, list_score)
print("AUC:",auc_value)
# Print a verbal rating of the AUC value.
AUC_performance(auc_value)
# Draw the ROC curve; it is displayed by plt.show() below.
Draw_roc(auc_value)
# Dump the hard predictions made earlier for x_test.
print(y_pred)
plt.show()
结果
max index 235
A: 6.7807190511263755
B: 14.426950408889635
variable_names coef
0 Status of existing \nchecking account 0.464295
1 Duration in month -0.028331
2 Credit history 0.266643
3 Purpose -0.058816
4 Credit amount -0.000068
5 Savings account 0.188856
6 Present employment 0.053853
7 Installment rate in percentage \nof disposable... -0.187075
8 Personal status and sex 0.071832
9 Other debtors 0.030951
10 Present residence 0.032202
11 Property -0.112172
12 Age -0.004556
13 Other installment plans 0.108087
14 Housing -0.000092
15 Number of existing \ncredits at this bank 0.013457
16 Job -0.012922
17 Number of people being liable\n to provide mai... -0.031617
18 Telephone 0.022233
19 foreign worker 0.021205
model accuracy is: 0.7475
model precision is: 0.7655786350148368
model sensitivity is: 0.9214285714285714
f1_score: 0.8363047001620746
AUC: 0.7824952380952381
good classifier
[1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1
0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1
1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1
1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1
1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1]
ROC曲线
ROC曲线如下: