Author:LieDra

前言

下面将利用Logistic回归(逻辑回归)对German信用数据集建模,并给出分类与评分结果。

逻辑回归算法介绍

用一个函数去拟合一组已有数据点的过程,就叫做回归。

逻辑回归虽然带有“回归”二字,但它通常并不用于回归问题,而是用于分类(尤其是二分类)问题。

需要找出一个预测函数模型,使其输出值落在(0,1)区间内。然后选择一个阈值,如0.5:如果算出来的预测值大于0.5,就将样本判为类别1,否则判为类别0。

这里的预测函数一般就是Logistic函数(也叫Sigmoid函数)。

为了衡量样本的预测值与真实值之间的误差,逻辑回归中常用的损失函数是对数损失函数(log loss,又称交叉熵损失),即负对数似然。

逻辑回归模型参数可以通过梯度下降算法来求。
使用Sklearn时,我们并不需要考虑那么多的细节,只需要调整好部分参数即可。

代码

'''
version:
author:LieDra
Method:Logistic Regression--非标准化数据
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split #划分数据
# NOTE(review): `sklearn.linear_model.logistic` is a private module path that
# newer scikit-learn releases no longer expose; the public location is
# `sklearn.linear_model`. Confirm against the installed version.
from sklearn.linear_model.logistic import LogisticRegression    #逻辑回归
# import statsmodels.api as sm
#混淆矩阵计算
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Load the credit data set. The last column is used as the class label and
# every other column is used as a feature.
df_german = pd.read_excel("D:/study/5/code/python/python Data analysis and mining/class/dataset/german.xls")
# `DataFrame.ix` was removed in pandas 1.0; `.iloc` is the purely positional
# replacement and selects the same columns here.
y = df_german.iloc[:, -1]
x = df_german.iloc[:, :-1]

# Accuracy on the check subset for every random split; entry k belongs to
# random_state k + 1.
l1 = []
for i in range(1000):
    print('****'*50)
    print('第',i+1,'次test')
    # 60% of the rows (stratified on the label) go to training; the remaining
    # 40% form the hold-out set.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, train_size=0.6, random_state=i + 1)
    # The hold-out set is split again: 25% into x_test2 / y_test2 (unused
    # below) and 75% into the check subset that is scored.
    x_test2, x_check, y_test2, y_check = train_test_split(
        x_test, y_test, train_size=0.25, random_state=i + 1)
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)

    # Score the check subset once and reuse the value for printing and for
    # the history list (the original evaluated it twice).
    check_accuracy = classifier.score(x_check, y_check)
    print("accuracy on the training subset:{:.3f}".format(classifier.score(x_train,y_train)))
    print("accuracy on the check subset:{:.3f}".format(check_accuracy))
    l1.append(check_accuracy)

# NOTE: after the loop, `classifier`, `x_test` and `y_test` hold the values of
# the LAST iteration (random_state=1000), not those of the best split that is
# reported below.
print('max index',l1.index(max(l1))+1)
 
 
#得分公式
'''
P0 = 50
PDO = 10
theta0 = 1.0/20
B = PDO/np.log(2)
A = P0 + B*np.log(theta0)
'''
def Score(probability):
    """Convert a probability into a scorecard score.

    The score is ``A - B * ln(odds)`` with ``odds = p / (1 - p)``; ``A`` and
    ``B`` are the module-level scorecard constants.

    Args:
        probability: Predicted probability of the positive class.

    Returns:
        The score for that probability.
    """
    # Natural logarithm (base e) of the odds.
    odds = probability / (1 - probability)
    return A - B * np.log(odds)
#批量获取得分
def List_score(pos_probablity_list):
    """Return the scorecard score for every probability in the input.

    Args:
        pos_probablity_list: Iterable of positive-class probabilities.

    Returns:
        A list with ``Score(p)`` for each ``p``, in input order.
    """
    return [Score(p) for p in pos_probablity_list]
 
# Scorecard calibration constants.
P0 = 50            # base score
PDO = 10           # points to double the odds
theta0 = 1.0 / 20  # odds at which the score equals P0

# score = A - B * ln(odds): B follows from PDO, A from the (theta0, P0) anchor.
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)

print("A:", A)
print("B:", B)
# Fitted parameters: for g(x) = w1*x1 + ... + wn*xn + w0, coef_ carries the
# feature weights w1..wn and intercept_ carries w0.
intercept = classifier.intercept_
list_coef = list(classifier.coef_[0])

# Class probabilities for every row of x, one column per class.
# NOTE(review): the original comment states 0 = good customer and 1 = bad
# customer; confirm against the data set's label encoding.
probablity_list = classifier.predict_proba(x)
# Keep only the second column (probability of the second class).
pos_probablity_list = [row[1] for row in probablity_list]

# Scores and hard predictions for all rows.
list_score = List_score(pos_probablity_list)
list_predict = classifier.predict(x)
df_result = pd.DataFrame({
    "label": y,
    "predict": list_predict,
    "pos_probablity": pos_probablity_list,
    "score": list_score,
})

# df_result.to_excel("score_proba.xlsx")
# Print the per-row results.
print(df_result)

# Feature names: every column except the last one (the label column).
list_vNames = df_german.columns[:-1]
df_coef = pd.DataFrame({"variable_names": list_vNames, "coef": list_coef})

# df_coef.to_excel("coef.xlsx")
# Print the coefficient attached to each feature.
print(df_coef)


# Evaluate on the hold-out split left over from the training loop.
y_true = y_test
y_pred = classifier.predict(x_test)

# Compute every metric first, then report them in a fixed order.
accuracyScore = accuracy_score(y_true, y_pred)
# precision = TP / (TP + FP)
precision = precision_score(y_true, y_pred)
# recall (sensitivity) = TP / (TP + FN)
sensitivity = recall_score(y_true, y_pred)
# F1 = 2 * precision * recall / (precision + recall): the harmonic mean of
# precision and recall, best at 1 and worst at 0.
f1Score = f1_score(y_true, y_pred)

print('model accuracy is:', accuracyScore)
print('model precision is:', precision)
print('model sensitivity is:', sensitivity)
print("f1_score:", f1Score)
 
def AUC(y_true, y_scores):
    """Return the area under the ROC curve, folded into the upper half.

    The ROC curve is built with label 1 as the positive class. When the raw
    area is below 0.5 the value ``1 - area`` is returned instead.

    Args:
        y_true: True class labels.
        y_scores: Scores to rank the samples by.

    Returns:
        The (possibly mirrored) AUC value.
    """
    # AUC from the ROC points: auc(fpr, tpr).
    fpr, tpr, _ = metrics.roc_curve(y_true, y_scores, pos_label=1)
    area = auc(fpr, tpr)
    return 1 - area if area < 0.5 else area
 
def Draw_roc(auc_value):
    """Plot the ROC curve of the module-level ``list_score`` against ``y``.

    The curve is computed with label 0 as the positive class. The figure is
    only drawn here; showing it is left to the caller.

    Args:
        auc_value: AUC value shown in the legend entry of the curve.
    """
    false_pos, true_pos, _ = metrics.roc_curve(y, list_score, pos_label=0)
    curve_label = 'ROC curve (area = %0.2f)' % auc_value

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Diagonal line')
    plt.plot(false_pos, true_pos, label=curve_label)
    plt.title('ROC curve')
    plt.legend(loc="lower right")
 
#评价AUC表现
def AUC_performance(AUC):
    """Print a one-line verdict for an AUC value.

    Bands: ``>= 0.7`` good, ``(0.6, 0.7)`` not very good, ``(0.5, 0.6]``
    useless, ``<= 0.5`` bad. A value that matches no band (e.g. NaN) prints
    nothing.

    Args:
        AUC: The AUC value to rate.
    """
    if AUC >= 0.7:
        verdict = "good classifier"
    elif AUC > 0.6:
        verdict = "not very good classifier"
    elif AUC > 0.5:
        verdict = "useless classifier"
    elif AUC <= 0.5:
        verdict = "bad classifier,with sorting problems"
    else:
        # No comparison held, so stay silent.
        return
    print(verdict)
         
# AUC check. NOTE: the original comment said test-set data, but the call
# passes `y` and `list_score`, which earlier in this script are computed over
# all rows of `x`, not only the hold-out split.
auc_value=AUC(y, list_score)
print("AUC:",auc_value)
# Rate the AUC value.
AUC_performance(auc_value)
# Draw the ROC curve.
Draw_roc(auc_value)

# Hold-out predictions, then display the figure.
print(y_pred)
plt.show()

结果

max index 235
A: 6.7807190511263755
B: 14.426950408889635
                                       variable_names      coef
0              Status of existing \nchecking account   0.464295
1                                  Duration in month  -0.028331
2                                      Credit history  0.266643
3                                            Purpose  -0.058816
4                                       Credit amount -0.000068
5                                     Savings account  0.188856
6                                  Present employment  0.053853
7   Installment rate in percentage \nof disposable... -0.187075
8                             Personal status and sex  0.071832
9                                       Other debtors  0.030951
10                                  Present residence  0.032202
11                                          Property  -0.112172
12                                                Age -0.004556
13                           Other installment plans   0.108087
14                                           Housing  -0.000092
15         Number of existing \ncredits at this bank   0.013457
16                                               Job  -0.012922
17  Number of people being liable\n to provide mai... -0.031617
18                                         Telephone   0.022233
19                                    foreign worker   0.021205
model accuracy is: 0.7475
model precision is: 0.7655786350148368
model sensitivity is: 0.9214285714285714
f1_score: 0.8363047001620746
AUC: 0.7824952380952381
good classifier
[1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1
 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1]

ROC曲线

ROC曲线如下:

logistic回归临床预测模型 R语言 logistic回归分析预测_数据