# coding=utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import label
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
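'''
Every year high-school and university students apply to all kinds of colleges and universities.
Each student has a unique set of test scores, grades and background data, and the admissions
committee uses this data to decide whether to accept the applicant.
In this setting a binary classification algorithm can be used to accept or reject an application,
and logistic regression is a good choice.
gre - Graduate Record Exam, a generalized test for prospective graduate students, continuous between 200 and 800.
gpa - Cumulative grade point average, continuous between 0.0 and 4.0.
admit - Binary variable, 0 or 1, where 1 means the applicant was admitted to the program.
'''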
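'''
Difference between linear regression and logistic regression:
linear regression predicts a continuous value,
logistic regression predicts the probability of belonging to a class.
'''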
def logit(x):
    """Return the logistic (sigmoid) function ``1 / (1 + exp(-x))`` of *x*.

    Despite its name this is the logistic function, not its inverse (the
    log-odds "logit"); the name is kept because other code calls it.

    The naive form ``exp(x) / (1 + exp(x))`` overflows for large positive
    ``x`` and evaluates to ``inf / inf = nan``. This version only ever
    exponentiates ``-|x|``, which lies in (0, 1], so it cannot overflow.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        The element-wise logistic value(s) in [0, 1]; a NumPy scalar for
        scalar input.
    """
    # exp(-|x|) is at most 1, so neither tail can overflow.
    decay = np.exp(-np.abs(x))
    # sigmoid(x) = 1 / (1 + e^-x)      for x >= 0
    #            = e^x / (1 + e^x)     for x <  0   (here e^x == decay)
    numerator = np.where(np.asarray(x) >= 0, 1.0, decay)
    return numerator / (1.0 + decay)
# Plot of the logistic function.
def find_logit_():
    """Draw the logistic curve over [-6, 6] and evaluate it at -10 and 10.

    Fifty evenly spaced points are passed through ``logit`` and shown in a
    matplotlib window. The two tail evaluations are computed but not used.
    """
    grid = np.linspace(-6, 6, 50, dtype=float)
    plt.plot(grid, logit(grid), label='logistic')
    plt.xlabel("t")
    plt.ylabel("Probability")
    plt.title("Logistic Function")
    plt.show()
    # Tail values, as recorded by the original author:
    #   logit(-10) -> 4.5397868702434395e-05
    #   logit(10)  -> 0.99995460213129761
    lower_tail = logit(-10)
    upper_tail = logit(10)
def test_plt(admissions):
    """Show a scatter plot of the ``admit`` column against the ``gpa`` column.

    Args:
        admissions: Table indexable by the column names ``"gpa"`` and
            ``"admit"``.
    """
    gpa_values = admissions["gpa"]
    admit_flags = admissions["admit"]
    plt.scatter(gpa_values, admit_flags)
    plt.show()
# Predict the class label with logistic regression.
def logit_stand(admissions):
    """Fit a one-feature logistic regression and plot its predicted labels.

    The model is trained on the ``gpa`` column against the ``admit`` column
    and then predicts labels for the very same rows, so the plot and the
    returned error describe the fit on the training data only.

    Args:
        admissions: DataFrame providing ``gpa`` and ``admit`` columns.

    Returns:
        The mean squared error between ``admit`` and the predicted labels.
        (Previously this value was computed and then discarded.)
    """
    features = admissions[['gpa']]
    # Pass the target as a 1-D Series: a one-column DataFrame is a column
    # vector, which scikit-learn flattens only after emitting a
    # DataConversionWarning.
    target = admissions['admit']

    logistic_model = LogisticRegression()
    logistic_model.fit(features, target)
    predicted = logistic_model.predict(features)
    mse = mean_squared_error(target, predicted)

    plt.scatter(admissions['gpa'], predicted)
    plt.show()
    return mse
# predict_proba returns probability estimates instead of hard labels.
def logit_pro(admissions):
    """Fit a one-feature logistic regression and plot predicted probabilities.

    The model is trained on ``gpa`` against ``admit`` and the probability
    taken from column 1 of ``predict_proba`` is plotted against ``gpa`` for
    the same rows.

    Args:
        admissions: DataFrame providing ``gpa`` and ``admit`` columns.
    """
    features = admissions[['gpa']]
    logistic_model = LogisticRegression()
    # 1-D target avoids scikit-learn's DataConversionWarning for
    # column-vector ``y``.
    logistic_model.fit(features, admissions['admit'])

    # Two columns come back: column 0 is the probability of not being
    # admitted, column 1 the probability of being admitted.
    probabilities = logistic_model.predict_proba(features)
    plt.scatter(admissions['gpa'], probabilities[:, 1])
    plt.show()
# Evaluate how accurate the model's predictions are.
def logit_admin(admissions):
    """Fit on ``gpa``, then print training accuracy and sensitivity.

    Accuracy is the share of rows whose predicted label equals ``admit``.
    Sensitivity (true-positive rate) is TP / (TP + FN); it matters for
    imbalanced classes, where accuracy alone can look good while positives
    are rarely detected. Both figures are measured on the training rows; in
    practice they should be measured on a held-out test set.

    Side effect (kept from the original): the columns ``predicted_label``
    and ``actual_label`` are written onto the DataFrame that was passed in.

    Args:
        admissions: DataFrame providing ``gpa`` and ``admit`` columns.
    """
    features = admissions[['gpa']]
    logistic_model = LogisticRegression()
    # 1-D target avoids scikit-learn's DataConversionWarning for
    # column-vector ``y``.
    logistic_model.fit(features, admissions['admit'])

    admissions['predicted_label'] = logistic_model.predict(features)
    admissions['actual_label'] = admissions['admit']
    predicted = admissions['predicted_label']
    actual = admissions['actual_label']

    # Convert before dividing: ``len(a) / len(b) * 1.0`` truncates to 0
    # under Python 2 because the integer division happens first.
    correct = int((predicted == actual).sum())
    accuracy = float(correct) / len(admissions)
    print('预测的精度:')
    print(accuracy)

    # TP: predicted admitted and actually admitted.
    true_positives = int(((predicted == 1) & (actual == 1)).sum())
    # FN: predicted rejected but actually admitted.
    false_negatives = int(((predicted == 0) & (actual == 1)).sum())

    actual_positives = true_positives + false_negatives
    if actual_positives:
        sensitivity = true_positives / float(actual_positives)
    else:
        # No actual positives: the rate is undefined, so report nan rather
        # than raising ZeroDivisionError.
        sensitivity = float('nan')
    print(sensitivity)
def test_train(admissions, train_size=515):
    """Train on a shuffled subset, evaluate on the rest and plot the ROC curve.

    Rows are shuffled with NumPy's global generator seeded to 8. The first
    ``train_size`` shuffled rows fit a logistic regression on ``gpa``; the
    remaining rows are used to compute accuracy, the ROC curve and its
    area (AUC). The ROC curve is shown in a matplotlib window.

    Side effects (kept from the original): NumPy's global random state is
    reseeded, and an ``actual_label`` column (a copy of ``admit``) is
    written onto the DataFrame that was passed in.

    Args:
        admissions: DataFrame providing ``gpa`` and ``admit`` columns.
        train_size: Number of shuffled rows used for training. Defaults to
            515, the split that used to be hard-coded.

    Returns:
        A tuple ``(accuracy, area)``: accuracy on the held-out rows and the
        area under the ROC curve. (Previously both were computed and then
        discarded.)

    Raises:
        ValueError: If ``train_size`` would leave the training or the test
            split empty.
    """
    if not 0 < train_size < len(admissions):
        raise ValueError(
            "train_size must be between 1 and {0}, got {1}".format(
                len(admissions) - 1, train_size))

    np.random.seed(8)
    admissions['actual_label'] = admissions['admit']
    # drop() returns a new frame without the given column.
    admissions = admissions.drop('admit', axis=1)

    # np.random.shuffle() works in place and returns nothing; permutation()
    # returns the shuffled index.
    shuffled_index = np.random.permutation(admissions.index)
    shuffled_admissions = admissions.loc[shuffled_index]
    train = shuffled_admissions.iloc[:train_size]
    # Copy the slice so that adding ``predicted_label`` below writes to an
    # independent frame (no SettingWithCopyWarning, no lost assignment).
    test = shuffled_admissions.iloc[train_size:].copy()

    logistic_model = LogisticRegression()
    # 1-D target avoids scikit-learn's DataConversionWarning for
    # column-vector ``y``.
    logistic_model.fit(train[['gpa']], train['actual_label'])

    test['predicted_label'] = logistic_model.predict(test[['gpa']])
    matches = test['predicted_label'] == test['actual_label']
    accuracy = float(matches.sum()) / len(test)

    # ROC curve: roc_curve(true labels, score of one class) returns the
    # false-positive rates, true-positive rates and thresholds.
    probabilities = logistic_model.predict_proba(test[['gpa']])
    admit_scores = probabilities[:, 1]
    fpr, tpr, _ = metrics.roc_curve(test['actual_label'], admit_scores)
    # The area under the ROC curve summarises the model in a single number.
    area = metrics.roc_auc_score(test['actual_label'], admit_scores)

    plt.plot(fpr, tpr)
    plt.show()
    return accuracy, area
if __name__ == '__main__':
    # Script entry point: load the admissions table from the working
    # directory and run one of the demos on it.
    data_path = "admissions.csv"
    admissions = pd.read_csv(data_path)
    # Other demos that can be run instead:
    #   find_logit_()
    #   logit_stand(admissions)
    #   logit_pro(admissions)
    #   logit_admin(admissions)
    test_train(admissions)