Contents
- Logistic Regression
- Regularized Logistic Regression
Logistic Regression
Problem Description
Suppose you are the administrator of a university department and want to determine each applicant's chance of admission based on their scores on two exams. You have historical data from previous applicants that can serve as a training set for logistic regression: for each training example, you have the applicant's scores on the two exams and the final admission decision. The task is to build a classification model that estimates the probability of admission from the two exam scores.
Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Read and display the data
path = r'E:\Master\ML\homework\02 Logistic Regression\ex2data1.txt'
#names adds column names; header specifies which row to use as the header.
#For data without a header row, set header=None, otherwise the first data row is treated as the header.
data = pd.read_csv(path,header=None,names=['Exam1','Exam2','Admitted'])
print(data.head())
#describe() gives a quick statistical summary of the data
print(data.describe())
Plot a scatter chart to visualize the data
#plot the training samples: positive means admitted, negative means not admitted
#pandas isin() filters rows, selecting only the rows whose values match
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]
fig1, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam1'],positive['Exam2'],s=50,c='b',marker='o',label='Admitted')
ax.scatter(negative['Exam1'],negative['Exam2'],s=50,c='r',marker='x',label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam1 Score')
ax.set_ylabel('Exam2 Score')
plt.show()
Define and test the sigmoid function
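For reference, the function implemented below is

$$g(z) = \frac{1}{1 + e^{-z}}$$

which maps any real input into the interval (0, 1).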
#define the sigmoid function
def sigmoid(z):
    #np.exp(x) computes e raised to the power x
    #when np.exp() is passed a vector, it evaluates e^x elementwise
    #and returns an array of the results
    return 1/(1+np.exp(-z))
#test the function above
#arange creates a 1-D arithmetic sequence from a start value, stop value, and step; the stop value is excluded
nums = np.arange(-10,10,0.5)
fig2,ax = plt.subplots(figsize=(12,8))
ax.plot(nums,sigmoid(nums),c='r')
plt.show()
Data preprocessing
#data preprocessing: add a column of ones for the intercept term
data.insert(0,'ones',1)
#cols = data.shape[1] #number of columns
#X = data.iloc[:,:cols-1] #take the first cols-1 columns as the input matrix
#y = data.iloc[:,cols-1:cols] #take the last column as the target vector
#print(X.head())
#print(y.head())
#X = np.matrix(X.values)
#y = np.matrix(y.values)
X = data.iloc[:,0:-1].values
y = data.iloc[:,-1].values
theta = np.zeros(3)
#check the array dimensions
print(X.shape,y.shape,theta.shape)
print(theta)
Define the cost function
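The next block implements the standard logistic regression cross-entropy cost:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right], \qquad h_\theta(x) = g(\theta^T x)$$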
#define the cost function
def cost(theta,X,y):
first = y * np.log(sigmoid(X@theta.T))
second = (1 - y) * np.log(1 - sigmoid(X@theta.T))
return -1 * np.mean(first + second)
#compute the cost with the initial (all-zero) theta
print(cost(theta,X,y))
Compute the gradient
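The partial derivative implemented below is the usual logistic regression gradient, computed as a single matrix product:

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$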
def gradient(theta,X,y):
return (1/len(X) * X.T @ (sigmoid(X@theta.T) - y))
print(gradient(theta,X,y))
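As a sanity check (not part of the original exercise), the analytic gradient can be compared against a centered finite-difference approximation; the helper name and eps value below are illustrative choices:

#illustrative numerical gradient check (helper name and eps chosen for demonstration)
def numeric_gradient(theta, X, y, eps=1e-6):
    grad = np.zeros_like(theta, dtype=float)
    for j in range(len(theta)):
        step = np.zeros_like(theta, dtype=float)
        step[j] = eps
        #centered difference approximates dJ/dtheta_j
        grad[j] = (cost(theta + step, X, y) - cost(theta - step, X, y)) / (2 * eps)
    return grad

#the largest discrepancy should be tiny (numerical noise only)
print(np.abs(numeric_gradient(theta, X, y) - gradient(theta, X, y)).max())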
Define gradient descent
# gradient descent
def gradientDescent(theta, X, y, alpha, iters):
    #initialize an ndarray to record the cost after each iteration
    costs = np.zeros(iters)
    for i in range(iters): # iterate
        theta = theta - alpha * gradient(theta, X, y)
        costs[i] = cost(theta, X, y)
    return theta, costs
#note: the exam scores are not feature-scaled, so a small learning rate is needed and convergence is slow
theta, costs = gradientDescent(theta, X, y, 0.001, 1000)
print(theta)
print(costs[-1])
Plot the cost curve
#plot of the cost over iterations
fig3,ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(1000),costs,c = 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Costs')
ax.set_title('Error vs. Iteration')
plt.show()
Advanced optimization algorithms
#instead of hand-rolled gradient descent, call scipy.optimize's fmin_tnc or minimize to find the optimum
import scipy.optimize as opt
#fmin_tnc() handles bound-constrained multivariate problems with a truncated Newton method, given gradient information
#func: objective to minimize  x0: initial guess  fprime: gradient of the objective  args: extra arguments passed to func
#returns: the solution array, the number of function evaluations, and an integer return code
#each call to the objective counts as one function evaluation; a single iteration usually involves
#several function evaluations, so this count is typically larger than the number of iterations
result1 = opt.fmin_tnc(func=cost,x0=theta,fprime=gradient,args=(X,y))
print(result1)
print(cost(result1[0], X, y))
#fun: the loss function (when defined, theta must be its first argument and a 1-D array)
#x0: the initial theta, which must be a 1-D array  args: extra parameters the loss function needs, passed as a tuple
#method: the algorithm to use; Newton-CG is the Newton conjugate gradient method  jac: the gradient function (theta must be its first argument and a 1-D array)
result2 = opt.minimize(fun=cost, x0=np.array(theta),
args=(X, np.array(y)), method='Newton-CG', jac=gradient)
print(result2)
print(cost(result2.x, X, y))
Prediction
def predict(theta,X):
probability = sigmoid(X @ theta.T)
return [1 if x >= 0.5 else 0 for x in probability]
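As a quick illustration (the scores below are chosen only for demonstration), the fitted model can also score a single hypothetical applicant directly:

#admission probability for a hypothetical applicant scoring 45 and 85
example = np.array([1, 45, 85])   #the leading 1 is the intercept term
print(sigmoid(example @ result2.x))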
Model accuracy
#theta_min = result1[0]
theta_min = result2.x
predictions = predict(theta_min, X)
correct = [1 if a == b else 0 for (a, b) in zip(predictions, y)]
accuracy = sum(correct) / len(correct)
print('accuracy = {0:.0f}%'.format(accuracy * 100))
Classification metrics
from sklearn.metrics import classification_report
print(classification_report(y,predictions))
Decision boundary
#the boundary is where theta0 + theta1*x1 + theta2*x2 = 0, i.e. x2 = -(theta0 + theta1*x1)/theta2
coef = -result2.x / result2.x[2]
x = np.arange(30, 100, 0.5)
y = coef[0] + coef[1] * x
fig4,ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam1'],positive['Exam2'],s=50,c='b',marker='o',label='Admitted')
ax.scatter(negative['Exam1'],negative['Exam2'],s=50,c='r',marker='x',label='Not Admitted')
ax.plot(x,y,label='Decision Boundary',c='grey')
ax.legend()
ax.set_xlabel('Exam1 Score')
ax.set_ylabel('Exam2 Score')
plt.show()
Regularized Logistic Regression
Problem Description
During quality assurance (QA), each microchip goes through various tests to ensure it functions correctly. Suppose you are the product manager of a factory and have the results of two different tests for a batch of microchips. From these two tests you want to decide whether each microchip should be accepted or rejected. To help make that decision, you have a dataset of past test results, from which you can build a regularized logistic regression model to predict whether a microchip from the fabrication plant passes QA.
Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Read and display the data
path = r'E:\Master\ML\homework\02 Logistic Regression\ex2data2.txt'
data = pd.read_csv(path,header=None,names=['Test1','Test2','Accepted'])
print(data.head())
print(data.describe())
Plot a scatter chart to visualize the data
positive = data[data['Accepted'].isin([1])]
negative = data[data['Accepted'].isin([0])]
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Test1'],positive['Test2'],s=50,c='b',marker='o',label='Accepted')
ax.scatter(negative['Test1'],negative['Test2'],s=50,c='r',marker='x',label='Rejected')
ax.legend()
ax.set_xlabel('Test1 Score')
ax.set_ylabel('Test2 Score')
plt.show()
The plot shows that the two classes cannot be separated by a straight line. Creating additional features from each data point allows a more complex decision boundary that fits the data better. Here the two features are mapped to all polynomial terms of x1 and x2 up to the 6th power, expanding the feature vector from 3 dimensions to 28 (a small worked example follows the function definition below).
Feature mapping
#power: the highest degree of the generated polynomial features  as_ndarray: if True return an ndarray, otherwise a DataFrame
def feature_mapping(x,y,power,as_ndarray=False):
    #build a dict mapping each feature name f{i-p}{p} to the term x^(i-p) * y^p
data = {'f{0}{1}'.format(i-p,p):np.power(x,i-p)*np.power(y,p)
for i in range(0,power+1)
for p in range(0,i+1)
}
if as_ndarray:
return pd.DataFrame(data).values
else:
return pd.DataFrame(data)
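A tiny worked example (inputs chosen only for illustration): with power=2 the mapping produces the six terms 1, x1, x2, x1^2, x1*x2, x2^2:

#with x1=2 and x2=3, the columns f00..f02 come out as 1, 2, 3, 4, 6, 9
print(feature_mapping(np.array([2.0]), np.array([3.0]), power=2))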
x1 = data.Test1.values
x2 = data.Test2.values
y = data.Accepted.values
data = feature_mapping(x1,x2,power=6)
print(data.head())
print(data.describe())
#build the 28-dimensional feature matrix and initialize theta to zeros
X = data.values
theta = np.zeros(X.shape[1])
Define the sigmoid function
def sigmoid(z):
return 1/(1+np.exp(-z))
Define the cost function and the regularized cost function
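The regularized cost adds an L2 penalty on every parameter except the intercept θ0:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$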
#define the cost function (same as before)
def cost(theta,X,y):
first = y * np.log(sigmoid(X@theta.T))
second = (1 - y ) * np.log(1-sigmoid(X@theta.T))
return -1 * np.mean(first+second)
#define the regularized cost function
def regularized_cost(theta,X,y,l=1):
theta_1n = theta[1:]
regularized_term = l / (2 * len(X)) * np.power(theta_1n,2).sum()
return cost(theta,X,y) + regularized_term
#cost with the initial (all-zero) theta
print(cost(theta,X,y))
#regularized cost with the initial theta (the two agree because theta is all zeros)
print(regularized_cost(theta,X,y))
Compute the gradient and the regularized gradient
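The corresponding gradient leaves θ0 unpenalized:

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1),$$

with the plain, unregularized term for j = 0.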
#define the gradient (same as before)
def gradient(theta,X,y):
return (1/len(X)) * X.T @ (sigmoid(X @ theta.T) - y)
#define the regularized gradient
def regularized_gradient(theta,X,y,l=1):
theta_1n = theta[1:]
regularized_theta = l/len(X) * theta_1n
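    #prepend a 0 so the intercept theta0 is not regularized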
regularized_term = np.concatenate([np.array([0]),regularized_theta])
return gradient(theta,X,y) + regularized_term
print(gradient(theta,X,y))
print(regularized_gradient(theta,X,y))
Advanced optimization algorithms
import scipy.optimize as opt
result = opt.minimize(fun=regularized_cost,x0=theta,args=(X,y),
method='Newton-CG',jac=regularized_gradient)
print(result)
print(regularized_cost(result.x, X, y, l=1))
result1 = opt.fmin_tnc(func=regularized_cost,x0=theta,fprime=regularized_gradient,args=(X,y))
print(result1[0])
print(regularized_cost(result1[0], X, y, l=1))
Prediction
def predict(theta,X):
probability = sigmoid(X@theta.T)
return [1 if x >= 0.5 else 0 for x in probability]
Classification metrics
from sklearn.metrics import classification_report
y_pred = predict(result.x,X)
print(classification_report(y,y_pred))
Decision boundary
#decision boundary: evaluate theta^T x on a dense grid and keep the points where it is close to zero
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.2, density)
    t2 = np.linspace(-1, 1.2, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)
    pred = mapped_cord.values @ theta.T
    decision = mapped_cord[np.abs(pred) <= threshold]
    #f10 and f01 are the columns holding the raw x1 and x2 values
    return decision.f10, decision.f01
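As an aside (not in the original exercise), matplotlib can trace the zero level set directly, which avoids tuning a threshold; this sketch reuses the `result` from the Newton-CG fit above, and the grid size of 200 is an arbitrary illustrative choice:

#alternative sketch: draw the theta^T x = 0 contour with plt.contour
t = np.linspace(-1, 1.2, 200)
xx, yy = np.meshgrid(t, t)
zz = feature_mapping(xx.ravel(), yy.ravel(), power=6).values @ result.x
plt.contour(xx, yy, zz.reshape(xx.shape), levels=[0], colors='grey')
plt.show()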
# plot the decision boundary
def draw_boundary(power, l):
    density = 2000
    threshold = 2 * 10 ** -3
    res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y, l), method='TNC', jac=regularized_gradient)
    theta1 = res.x
    x1, y1 = find_decision_boundary(density, power, theta1, threshold)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive['Test1'], positive['Test2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative['Test1'], negative['Test2'], s=50, c='g', marker='x', label='Rejected')
ax.scatter(x1, y1, s=50, c='r', marker='.', label='Decision Boundary')
ax.legend()
ax.set_xlabel('Test1 Score')
ax.set_ylabel('Test2 Score')
plt.show()
draw_boundary(6, l=1)
Underfitting
draw_boundary(6, l=100)
Overfitting
draw_boundary(6, l=0)