Concept notes:
Multi-class logistic regression implementation in code:
"""
多类分类
使用逻辑回归来识别手写数字0-9
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat  # for reading MATLAB data files
# Prepare the data
data = loadmat('ex3data1.mat')
# print(data)
# print(data['X'].shape, data['y'].shape) # (5000, 400) (5000, 1)
# First task: fully vectorize the logistic regression implementation
# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
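# Optional sanity check (not part of the original exercise): sigmoid(0) should be exactly 0.5,
# and large positive / negative inputs should saturate towards 1 and 0.
# print(sigmoid(0))  # 0.5
# print(sigmoid(np.array([-10, 0, 10])))  # roughly [4.54e-05, 0.5, 0.9999546]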
# Cost function
def cost(theta, X, y, learningRate):
    """
    :param theta: parameters, shape [1 + feature_num, ]
    :param X: input features, shape [sample_num, 1 + feature_num]
    :param y: output labels, shape [sample_num, ]
    :param learningRate: regularization parameter, scalar
    :return: whole_cost: cost value, scalar
    """
    # Convert theta, X and y to numpy arrays
    theta = np.array(theta)  # shape [1 + feature_num, ]
    X = np.array(X)  # shape [sample_num, 1 + feature_num]
    y = np.array(y)  # shape [sample_num, ]
    # Compute the cross-entropy part of the cost (without regularization)
    predict = sigmoid(X @ theta.T)  # shape [sample_num, ]
    cross_cost = -y * np.log(predict) - (1 - y) * np.log(1 - predict)
    # cross_cost: shape [sample_num, ]
    # Compute the regularization term (theta[0], the bias, is not regularized)
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[1:], 2))  # scalar
    # Add the two parts together
    whole_cost = np.sum(cross_cost) / len(X) + reg  # scalar
    return whole_cost
# theta = np.zeros([1 + 400, ])
# X = np.insert(data['X'], 0, values=np.ones([5000, ]), axis=1)
# y_0 = np.array([1 if label == 1 else 0 for label in data['y'].ravel()]).reshape([len(X), ])
# print(cost(theta, X, y_0, 1))
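# Expected value of the check above: with theta all zeros, every prediction is sigmoid(0) = 0.5,
# the regularization term is 0, and the cost is -log(0.5), about 0.693.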
# Vectorized gradient of the regularized cost (the optimizer below performs the actual descent)
def gradient(theta, X, y, learningRate):
    """
    :param theta: parameters, shape [1 + feature_num, ]
    :param X: input features, shape [sample_num, 1 + feature_num]
    :param y: output labels, shape [sample_num, ]
    :param learningRate: regularization parameter, scalar
    :return: dtheta: gradient, shape [1 + feature_num, ]
    """
    # Convert theta, X and y to numpy arrays
    theta = np.array(theta)  # shape [1 + feature_num, ]
    X = np.array(X)  # shape [sample_num, 1 + feature_num]
    y = np.array(y)  # shape [sample_num, ]
    # Number of parameters to update (ravel flattens the array to 1-D)
    parameters = int(theta.ravel().shape[0])
    # Compute the prediction error
    predict = sigmoid(np.dot(X, theta.T))  # shape [sample_num, ]
    error = predict - y  # shape [sample_num, ]
    # Compute the gradient from the error, including the regularization term
    dtheta = np.dot(error, X).reshape(theta.shape) / len(X) + (learningRate / len(X)) * theta
    # dtheta: shape [1 + feature_num, ]
    # The bias term theta[0] is not regularized, so remove its regularization contribution
    dtheta[0] -= (learningRate / len(X)) * theta[0]
    return dtheta  # shape [1 + feature_num, ]
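# Optional numerical gradient check, a rough sketch that is not part of the original exercise:
# compare the analytic gradient against a central finite difference on a tiny random problem.
# def check_gradient(eps=1e-4):
#     rng = np.random.RandomState(0)
#     X_s = np.insert(rng.randn(5, 3), 0, values=np.ones([5, ]), axis=1)  # 5 samples, 3 features + bias
#     y_s = rng.randint(0, 2, size=[5, ]).astype(float)
#     theta_s = rng.randn(4)
#     analytic = gradient(theta_s, X_s, y_s, 1)
#     numeric = np.zeros_like(theta_s)
#     for j in range(len(theta_s)):
#         e = np.zeros_like(theta_s)
#         e[j] = eps
#         numeric[j] = (cost(theta_s + e, X_s, y_s, 1) - cost(theta_s - e, X_s, y_s, 1)) / (2 * eps)
#     print(np.max(np.abs(analytic - numeric)))  # should be tiny, e.g. < 1e-6
# check_gradient()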
# Based on the cost and gradient functions defined above, build the training function.
# This task has 10 possible classes, while logistic regression can only separate 2 classes at a time.
# The solution is one-vs-all: train 10 classifiers, where classifier i separates "is class i" from "is not class i", i = 1...10.
# The training of all 10 classifiers is wrapped in a single function that computes their final weights
# and returns them as a 10 x (1 + feature_num) numpy array.
from scipy.optimize import minimize
def one_vs_all(X, y, num_labels, learning_rate):
    """
    :param X: input features, shape [sample_num, feature_num]
    :param y: output labels, shape [sample_num, 1]
    :param num_labels: number of classes, scalar
    :param learning_rate: regularization parameter, scalar
    :return: all_theta, shape [num_labels, 1 + feature_num]
    """
    rows = X.shape[0]  # sample_num
    params = X.shape[1]  # feature_num
    # Initialize all weights to 0
    all_theta = np.zeros([num_labels, 1 + params])
    # Insert a column of ones at the left of X for the bias term x0
    X = np.insert(X, 0, values=np.ones([rows, ]), axis=1)  # shape [sample_num, 1 + feature_num]
    # Training stage: train 10 classifiers, one per class
    for i in range(1, num_labels + 1):
        theta = np.zeros([1 + params, ])  # shape [1 + feature_num, ]
        y_i = np.array([1 if label[0] == i else 0 for label in y]).reshape([rows, ])  # shape [sample_num, ]
        # Minimize the objective (the cost function)
        # X: shape [sample_num, 1 + feature_num], y_i: shape [sample_num, ]
        fmin = minimize(fun=cost, x0=theta, args=(X, y_i, learning_rate), method='TNC', jac=gradient)
        # scipy's minimize passes theta as a flat array of shape [1 + feature_num, ],
        # so gradient must return that same shape
        all_theta[i - 1, :] = fmin.x
    return all_theta  # shape [num_labels, 1 + feature_num]
all_theta = one_vs_all(X=data['X'], y=data['y'], num_labels=10, learning_rate=1)
# print(all_theta)
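# Quick shape check (optional): one row of weights per digit class, plus the bias column.
# print(all_theta.shape)  # (10, 401)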
# Use the trained classifiers to predict a label for each image: compute the probability of every class
# and output, for each sample, the label with the highest probability.
def predict_all(X, all_theta):
    """
    :param X: input features, shape [sample_num, feature_num]
    :param all_theta: parameter matrix of the 10 classifiers, shape [10, 1 + feature_num]
    :return: h_argmax: predicted labels for all samples, shape [sample_num, ]
    """
    # Get the dimensions of the input matrix
    rows = X.shape[0]  # sample_num
    params = X.shape[1]  # feature_num
    num_labels = all_theta.shape[0]  # num_labels
    # Insert a column of ones at the left of X for the bias term x0
    # X = np.insert(X, [0], values=np.ones([rows, 1]), axis=1)
    X = np.insert(X, 0, values=np.ones([rows, ]), axis=1)  # shape [sample_num, 1 + feature_num]
    # Convert X and all_theta to numpy arrays
    X = np.array(X).reshape([rows, 1 + params])  # shape [sample_num, 1 + feature_num]
    all_theta = np.array(all_theta).reshape([num_labels, 1 + params])  # shape [num_labels, 1 + feature_num]
    # Compute the probability of each class for every sample
    h = sigmoid(np.dot(X, all_theta.T))  # shape [sample_num, num_labels]
    # For each sample, find the index of the class with the highest probability
    h_argmax = np.argmax(h, axis=1)  # shape [sample_num, ]
    # Index + 1 gives the corresponding predicted label
    h_argmax = h_argmax + 1  # shape [sample_num, ]
    return h_argmax  # shape [sample_num, ]
y_pred = predict_all(data['X'], all_theta) # shape [sample_num, ]
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, data['y'])] # shape [sample_num, ]
accuracy = (sum(map(int, correct))) / float(len(correct))
print('accuracy = {}%'.format(accuracy * 100))
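# Optional extension, not part of the original exercise: a per-class accuracy breakdown,
# which shows how well each of the 10 one-vs-all classifiers does on its own digit
# (in ex3data1.mat the label 10 stands for the digit 0).
# y_true = data['y'].ravel()
# for label in range(1, 11):
#     mask = (y_true == label)
#     print('label {} (digit {}): accuracy = {:.2%}'.format(label, label % 10, np.mean(y_pred[mask] == y_true[mask])))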