# coding=utf-8

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import label

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error
from sklearn import metrics

'''
每年高中生和大学生都会申请进入到各种各样的高校中去。每个学生都有一组唯一的考试分数,
成绩和背景数据。录取委员会根据这个数据决定是否接受这些申请者。
在这种情况下一个二元分类算法可用于接受或拒绝申请,逻辑回归是个不错的方法。

gre - Graduate Record Exam(研究生入学考试), a generalized test for prospective graduate students(一个通用的测试未来的研究生), continuous between 200 and 800.
gpa - Cumulative grade point average(累积平均绩点), continuous between 0.0 and 4.0.
admit - Binary variable, 0 or 1, where 1 means the applicant was admitted to the program.
'''
'''
线性回归和逻辑回归的区别:
线性回归是预测值,
逻辑回归的预测某一类的概率值

'''

def logit(x):
    """Logistic (sigmoid) function, numerically stable for any x.

    The naive form ``np.exp(x) / (1 + np.exp(x))`` overflows for large
    positive x.  Using ``exp(-|x|)`` keeps the exponent non-positive, and
    the two algebraically equivalent branches cover both signs:

        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))

    Accepts scalars or array-likes; returns an ndarray of probabilities
    in (0, 1).
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))  # always in (0, 1]; never overflows
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))



# Visualize the logistic-regression link function.
def find_logit_():
    """Plot the logistic curve over [-6, 6] and show it on screen."""
    xs = np.linspace(-6, 6, 50, dtype=float)
    ys = logit(xs)

    plt.plot(xs, ys, label='logistic')
    plt.ylabel("Probability")
    plt.xlabel("t")
    plt.title("Logistic Function")
    plt.show()

    # Tail samples of the curve (unused; kept for illustration):
    #   logit(-10) ~= 4.5397868702434395e-05
    #   logit(10)  ~= 0.99995460213129761
    a = logit(-10)
    b = logit(10)

def test_plt(admissions):
    """Scatter-plot admission outcome (0/1) against GPA and display it."""
    plt.scatter(admissions["gpa"], admissions["admit"])
    plt.show()

# Predict hard class labels with logistic regression.
def logit_stand(admissions):
    """Fit LogisticRegression on 'gpa' and scatter-plot the 0/1 predictions.

    Fix: y is passed as a 1-D Series (``admissions['admit']``) rather than
    a one-column DataFrame, which makes sklearn emit a
    DataConversionWarning and ravel the column internally.
    """
    logistic_model = LogisticRegression()
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    pre = logistic_model.predict(admissions[['gpa']])

    # Mean squared error of hard 0/1 predictions vs. actual labels;
    # for binary labels this equals the misclassification rate.
    mse = mean_squared_error(admissions['admit'], pre)

    plt.scatter(admissions['gpa'], pre)

    plt.show()

# predict_proba: predict class probabilities instead of hard labels.
def logit_pro(admissions):
    """Plot P(admit=1) against GPA using predict_proba.

    Fix: y is passed as a 1-D Series to avoid sklearn's
    DataConversionWarning for column-vector targets.
    """
    logistic_model = LogisticRegression()
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    # predict_proba returns two columns: [:, 0] = P(class 0),
    # [:, 1] = P(class 1); we plot the probability of admission.
    pre = logistic_model.predict_proba(admissions[['gpa']])

    plt.scatter(admissions['gpa'], pre[:, 1])

    plt.show()

# Evaluate model accuracy, then sensitivity for the imbalanced-class case.
def logit_admin(admissions):
    """Fit on 'gpa', print training-set accuracy and sensitivity (recall).

    Fixes:
    - y is a 1-D Series (avoids sklearn DataConversionWarning);
    - ``actual_label`` is assigned from a Series, not a one-column
      DataFrame;
    - accuracy uses ``float(...) / n`` — the original
      ``len(a) / len(b) * 1.0`` divided *before* promoting to float, so
      the ``* 1.0`` guard came too late under integer division;
    - ``senditivity`` typo corrected (local name only).
    """
    logistic_model = LogisticRegression()
    logistic_model.fit(admissions[['gpa']], admissions['admit'])

    labels = logistic_model.predict(admissions[['gpa']])

    admissions['predicted_label'] = labels

    admissions['actual_label'] = admissions['admit']

    matches = admissions['predicted_label'] == admissions['actual_label']

    correct_predictions = admissions[matches]

    accuracy = float(len(correct_predictions)) / len(admissions)
    print('预测的精度:')
    print(accuracy)

    # Accuracy alone is misleading on imbalanced classes; inspect how well
    # the positive class is detected.  NOTE(review): these are computed on
    # the training set — in practice they belong on a held-out test set.
    # TP: predicted admitted and actually admitted
    true_positive_filter = (admissions['predicted_label'] == 1) & (admissions['actual_label'] == 1)
    true_positives = len(admissions[true_positive_filter])

    # TN: predicted rejected and actually rejected
    true_negatives_filter = (admissions['predicted_label'] == 0) & (admissions['actual_label'] == 0)
    true_negatives = len(admissions[true_negatives_filter])

    # FN: predicted rejected but actually admitted
    false_negatives_filter = (admissions['predicted_label'] == 0) & (admissions['actual_label'] == 1)
    false_negatives = len(admissions[false_negatives_filter])

    # Sensitivity (recall) = TP / (TP + FN)
    sensitivity = true_positives / float(true_positives + false_negatives)

    print(sensitivity)



def test_train(admissions):

np.random.seed(8)

admissions['actual_label'] = admissions[['admit']]

#删除某行或某列
admissions = admissions.drop('admit', axis=1)

#np.random.shuffle() 没有返回值 permutation有返回打乱的index
shffle_admissions = np.random.permutation(admissions.index)

shuffled_admissions = admissions.loc[shffle_admissions]

train = shuffled_admissions.iloc[0:515]
test = shuffled_admissions.iloc[515:len(shuffled_admissions)]

logistic_model = LogisticRegression()
logistic_model.fit(train[['gpa']], train[['actual_label']])

labels = logistic_model.predict(test[['gpa']])

test['predicted_label'] = labels

matches = test['predicted_label'] == test['actual_label']

correct_predictions = test[matches]

accuracy = len(correct_predictions) / len(test)*1.0
#print('预测的精度:')
#print(accuracy)



'''
ROC曲线:

roc_curve (真是的label,是一个概率值(某一列))
返回值 : FP,TP
'''
probabilities = logistic_model.predict_proba(test[['gpa']])
fpr, tpr, thresholds = metrics.roc_curve(test['actual_label'], probabilities[:,1])

#计算ROC曲线的面积,对模型的总和评判
area = metrics.roc_auc_score(test['actual_label'], probabilities[:, 1])

plt.plot(fpr, tpr)
plt.show()


if __name__ == '__main__':
    admissions = pd.read_csv("admissions.csv")

    # Uncomment any of these to run the other demos:
    #find_logit_()
    #logit_stand(admissions)
    #logit_pro(admissions)
    #logit_admin(admissions)

    test_train(admissions)


# coding=utf-8


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy
# Linear regression: predict mpg from vehicle weight.
# Fix: the original '//线性回归' used C-style comment syntax, which is a
# SyntaxError in Python.
columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
cars = pd.read_table('auto-mpg.data', delim_whitespace=True, names=columns)

# Build the linear model (fit_intercept=True also learns a bias term).
lr = LinearRegression(fit_intercept=True)

# Train the model:
#   first argument  — sample matrix (one feature column, 'weight')
#   second argument — target vector ('mpg'; a matrix would mean multi-label)
lr.fit(cars[['weight']], cars['mpg'])

# Predictions on the training data.
predictions = lr.predict(cars[['weight']])

#print(predictions[0:5])
#print(cars['mpg'][0:5])

# Mean squared error = sum((predicted - actual)^2) / n
mse = mean_squared_error(cars['mpg'], predictions)
print(mse)

# Root mean squared error — same units as mpg.
rmse = mse ** 0.5

# Actual values in red, fitted predictions in blue.
plt.scatter(cars[['weight']], cars['mpg'], c='red')
plt.scatter(cars[['weight']], predictions, c='blue')

plt.show()



# coding=utf-8


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

'''
数据集:
mpg:跑公里
cylinders : 气缸数
displacement:
horsepower: 马力
weight: 重
acceleration:加速度
model year: 生产年
origin: 生产地
'''


'''
用逻辑回归解决多分类问题:
对于三个类别A,B,C,需要进行分解,对问题进行分类,
首先把A作为正例,把BC放在一起
其次 B作为正例,把AC放在一起
再者 C作为正例,把AB放在一起
分别求三者的概率,求出最大值


步骤:
1.读取数据,并把数据洗牌
2.进行数据集和测试集的划分
3.依照类别的个数,对每一中类别进行逻辑回归
4.求出几种类别的最大值,构建权重向量

'''


# One-vs-rest logistic regression: predict car origin (1, 2, 3) from
# dummy-encoded cylinder-count and model-year features.
# Fix: loop bodies were dedented to column 0 in the source (SyntaxError);
# indentation restored and misspelled locals (cummy_/shuffed_) renamed.
columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
cars = pd.read_table('auto-mpg.data', delim_whitespace=True, names=columns)

# get_dummies: one indicator column per distinct value, named
# '<prefix>_<value>' (e.g. cyl_4, cyl_8).
dummy_cylinders = pd.get_dummies(cars['cylinders'], prefix='cyl')

# concat glues the indicator columns onto the original frame.
cars = pd.concat([cars, dummy_cylinders], axis=1)

dummy_years = pd.get_dummies(cars['model year'], prefix='year')
cars = pd.concat([cars, dummy_years], axis=1)

# Drop the raw categorical columns now that they are one-hot encoded.
cars = cars.drop('model year', axis=1)
cars = cars.drop('cylinders', axis=1)

# Shuffle the row index, then split 70% train / 30% test.
# NOTE(review): no seed is set, so the split differs on every run.
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]
highest_train_row = int(cars.shape[0] * 0.7)

train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

# Distinct origin labels, sorted, e.g. [1, 2, 3].
unique_origins = cars['origin'].unique()
unique_origins.sort()

models = {}
features = [c for c in train.columns if c.startswith('cyl') or c.startswith('year')]
#print(features)

# One binary classifier per class (one-vs-rest): the current origin is
# the positive class, every other origin is negative.
for origin in unique_origins:
    model = LogisticRegression()
    X_train = train[features]
    y_train = train['origin'] == origin

    model.fit(X_train, y_train)

    models[origin] = model  # keep each per-class model

# P(origin == k) for every test row; one column per origin value.
testing_probs = pd.DataFrame(columns=unique_origins)
for origin in unique_origins:
    X_test = test[features]

    # Column 1 of predict_proba is the probability of the positive class.
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]

# Final prediction: the origin whose classifier gave the highest
# probability for that row.
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)