Contents
- Evaluation Methods
  - Hold-out
  - Cross-Validation
  - Bootstrapping
- Performance Measures
- Linear Regression
  - Univariate Least-Squares Regression
  - Multivariate Least-Squares Regression
- Logistic Regression (Logit Regression)
  - Without the Optimization Code
  - Complete Code (No Extra Files Needed)
- LDA (Linear Discriminant Analysis)
  - Function That Only Computes the Projection Direction
  - LDA Class
- Decision Trees
  - ID3
- Neural Networks
  - Perceptron
  - Multi-Layer Feedforward Network (Standard BP)
- Support Vector Machines
  - Clearing Up Some Questions from the Book
  - The SMO Algorithm
    - Deriving the Closed-Form Solution
    - Designing the Iteration Loop
  - Hard-Margin SVM
  - Kernel Function Idea
  - Soft-Margin Idea
- Clustering
  - k-Means
Evaluation Methods
The code here does not call sklearn.model_selection directly, and it only supports pandas DataFrame objects.
All of it has been tested by me under Python 3.7; adapt it to your own needs.
Hold-out
import pandas as pd
import random
# from numpy import random
def train_test_split(X,Y=None,test_size=0.2):
length = len(X)
train_index = list(range(length))
test_index = random.sample(train_index,int(length*test_size))
for x in test_index:
train_index.remove(x)
if Y is not None:
return X.iloc[train_index],X.iloc[test_index],Y.iloc[train_index],Y.iloc[test_index]
else:
return X.iloc[train_index],X.iloc[test_index]
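A quick usage sketch (the toy DataFrame and labels below are made up purely for illustration):
if __name__ == '__main__':
    df = pd.DataFrame({'x1': range(10), 'x2': range(10, 20)})
    labels = pd.Series([i % 2 for i in range(10)])
    X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2)
    print(len(X_train), len(X_test))   # expected: 8 2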
Cross-Validation
import pandas as pd
import random
# from numpy import random
def cross_validation(X,Y=None,k=10):
'''
返回长度为k的列表
'''
folds_X = []
if Y is not None:
folds_Y = []
split_index = []
tmp_index = list(range(len(X)))
for i in range(k,0,-1):
# take a 1/i share of the remaining samples so that the k folds stay evenly sized
split_index.append(random.sample(tmp_index,len(tmp_index)//i))
for x in split_index[-1]:
tmp_index.remove(x)
if Y is not None:
for index in split_index:
folds_X.append(X.iloc[index])
folds_Y.append(Y.iloc[index])
return folds_X,folds_Y
else:
for index in split_index:
folds_X.append(X.iloc[index])
return folds_X
Bootstrapping
import pandas as pd
import random
# from numpy import random
def bootstrapping(X,Y=None):
length = len(X)
train_index = []
test_index = list(range(length))
for i in range(length):
tmp = random.randint(0,length-1)
train_index.append(tmp)
if tmp in test_index:
test_index.remove(tmp)
if not len(test_index):
print('Warning: Bad luck is a horror!')
if Y is not None:
return X.iloc[train_index],X.iloc[test_index],Y.iloc[train_index],Y.iloc[test_index]
else:
return X.iloc[train_index],X.iloc[test_index]
Performance Measures
If you want to wrap the functions below into a class, change the return statements inside them into assignments to class attributes (a small sketch is given at the end of this subsection).
def mean_squared_error(y,predict_y):
'''
回归 均方误差
'''
m = len(y)
ans = 0
for i in range(m):
ans += (y[i]-predict_y[i])**2
ans /= m
return ans
def er(y,predict_y):
'''
分类 错误率
'''
m = len(y)
ans = 0
for i in range(m):
if y[i] != predict_y[i]:
ans += 1
ans /= m
return ans
def acc(y,predict_y):
'''
分类 精度
'''
m = len(y)
ans = 0
for i in range(m):
if y[i] == predict_y[i]:
ans += 1
ans /= m
return ans
def precision(y,predict_y,positive=1,negative=0):
'''
二分类 查准率: 预测是正例且实际上就是正例的比例
即P-R曲线中的查准率P(注意它并不等于ROC曲线中的假正例率)
'''
a = 0
b = 0
for i in range(len(y)):
if predict_y[i] == positive:
b += 1
if y[i] == positive:
a += 1
if b:
return a/b
else:
return 0
def recall(y,predict_y,positive=1,negative=0):
'''
二分类 召回率: 实际是正例且预测也是正例的比例
数值上等于ROC曲线中的真正例率
'''
a = 0
b = 0
for i in range(len(y)):
if y[i] == positive:
b += 1
if predict_y[i] == positive:
a += 1
if b:
return a/b
else:
return 0
def F_beta(y,predict_y,positive=1,negative=0,beta=1):
'''
二分类 F_beta(default:F1)
'''
a = 0
b1 = 0
b2 = 0
for i in range(len(y)):
t1 = int(predict_y[i] == positive)
t2 = int(y[i] == positive)
b1 += t1
b2 += t2
a += t1&t2
if b1 == 0 or b2 == 0 or a == 0:
return 0
p = a/b1
r = a/b2
return (1+beta**2)*p*r/(beta**2*p+r)
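As mentioned at the top of this subsection, wrapping a metric into a class just means replacing the return with an attribute assignment; a minimal sketch (reusing mean_squared_error) could look like this:
class Metrics:
    def mean_squared_error(self, y, predict_y):
        m = len(y)
        ans = 0
        for i in range(m):
            ans += (y[i]-predict_y[i])**2
        self.mse = ans/m          # instead of `return ans`

# usage: metric = Metrics(); metric.mean_squared_error([1,2],[1,4]); print(metric.mse)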
Linear Regression
Univariate Least-Squares Regression
import numpy as np
def least_square_method(X,Y):
'''
单变量最小二乘回归
X,Y 为一维数组/列表 返回浮点数 w,b
'''
m = len(X)
X = np.array(X)
Y = np.array(Y)
w = np.sum(Y*(X-np.average(X)))/(np.sum(X**2)-np.sum(X)**2/m)
b = np.sum(Y-w*X)/m
return w,b
if __name__ == '__main__':
w = 3
b = 1
x = [np.random.randint(1,10) for i in range(5)]
y = [w*x[i]+b+np.random.randn()/10 for i in range(5)]
w_p,b_p = least_square_method(x,y)
print('w_predict = %.2f'%w_p)
print('b_predict = %.2f'%b_p)
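For reference, the closed-form solution implemented above is
$$w=\frac{\sum_{i=1}^m y_i\,(x_i-\bar x)}{\sum_{i=1}^m x_i^2-\frac1m\left(\sum_{i=1}^m x_i\right)^2},\qquad b=\frac1m\sum_{i=1}^m\left(y_i-w x_i\right).$$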
Multivariate Least-Squares Regression
import numpy as np
from numpy import random
def multivariate_linear_regression(X,Y):
'''
多元最小二乘回归
要求样本数大于维数
Y可以是一维数组或二维数组
返回 array[w1, w2, ... , wd, b]
'''
X = np.mat(X)
Y = np.mat(Y)
m = len(X)
if len(Y) == 1:
Y = Y.T
X = np.c_[X,np.ones(m)]
w = np.linalg.inv(X.T*X)*X.T*Y
w = np.hstack(np.array(w))
return w
if __name__ == '__main__':
w = [random.randint(1,10) for i in range(5)]
b = random.randint(-5,5)
print('real w&b')
print(*w,b,sep='\t')
X = []
Y = []
for i in range(20):
X.append([])
Y.append(b+random.random()*4-2)
for j in range(5):
X[-1].append(random.random()*20-10)
Y[-1] += w[j]*X[-1][-1]
wp = multivariate_linear_regression(X,Y)
wp = map(lambda x:round(x,2),wp)
print('predict w&b')
print(*wp,sep='\t')
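Writing $\mathbf X$ for the design matrix with a column of ones appended (exactly what the code builds with np.c_), the returned vector is the normal-equation solution
$$\hat w=\left(\mathbf X^{\mathrm T}\mathbf X\right)^{-1}\mathbf X^{\mathrm T}\mathbf y .$$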
Logistic Regression (Logit Regression)
At first glance logistic regression is just linear regression pushed through a special link function (the sigmoid). In practice, however, the training labels can only be 0 and 1, and the sigmoid only attains 0 and 1 in the limit as its input goes to minus or plus infinity, so we cannot fit the parameters by least squares.
Fortunately, besides least squares there is another great estimation principle, maximum likelihood. If we read the sigmoid output as a probability (formally, the model says the log-odds ln(p/(1-p)) is a linear function of x), we can fit the parameters by finding the maximizer of the likelihood function.
The hard part is that we have to write a multivariate search algorithm ourselves (see the optimization notes mentioned at the top of this article).
Once that search algorithm is available, the implementation is straightforward.
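Concretely, writing $\hat x=(x;1)$ and $\beta=(w;b)$, the quantity to minimize (this is exactly what fun() computes in the code below) is the negative log-likelihood
$$\ell(\beta)=\sum_{i=1}^m\left(-y_i\,\beta^{\mathrm T}\hat x_i+\ln\!\left(1+e^{\beta^{\mathrm T}\hat x_i}\right)\right).$$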
Without the Optimization Code
import numpy as np
import pandas as pd
from numpy import log,exp
from 多维直接搜索算法 import univarite_search_technique
def logistic_regression(X,Y):
'''
逻辑回归 (对数几率回归)
要求样本数大于维数
Y可以是一维数组或二维数组(1为正例 0为反例)
返回 array[w1, w2, ... , wd, b]
'''
X = np.mat(X)
Y = np.mat(Y)
m = len(X)
if len(Y) == 1:
Y = Y.T
X = np.c_[X,np.ones(m)]
def fun(*w):
nonlocal X,Y
w = np.matrix(w)
return np.sum(-np.multiply(Y.T,w*X.T)+log(1+exp(w*X.T)))
beta = univarite_search_technique(fun,X.shape[1])
return beta
if __name__ == '__main__':
from numpy import random
w = [random.randint(1,10) for i in range(5)]
b = random.randint(-5,5)
print('real w&b')
print(*w,b,sep='\t')
X = []
Y = []
for i in range(20):
X.append([])
Y.append(b+random.random()*4-2)
for j in range(5):
X[-1].append(random.random()*20-10)
Y[-1] += w[j]*X[-1][-1]
if Y[-1] > 0:
Y[-1] = 1
else:
Y[-1] = 0
wp = logistic_regression(X,Y)
wp = list(map(lambda x:round(x,2),wp))
print('predict w&b')
print(*wp,sep='\t')
Complete Code (No Extra Files Needed)
import numpy as np
import pandas as pd
from numpy import log,exp
def success_failure_method(f,x0=0,h=5,maxiters=1000):
'''
成功-失败法求搜索区间
必须传入有最小值的严格凸函数
有可能搜索区间长度为0 代表输入的x0即为最优解
'''
y0 = f(x0)
y1 = f(x0+h)
while maxiters:
if y0 > y1:
y2 = f(x0+3*h)
if y1 > y2:
x0 = x0 + h
y0 = y1
y1 = y2
h = 2*h
maxiters -= 1
continue
else:
if h > 0:
return x0,x0+3*h
else:
return x0+3*h,x0
elif y0 == y1:
if h > 0:
return x0,x0+h
else:
return x0+h,x0
else:
h = -h/4
y1 = f(x0+h)
maxiters -= 1
continue
print('成功失败法传入了非凸函数')
return 0,0
def golden_section_method(f,a,b,acy=0.001,maxiters=1000):
'''
黄金分割法求最小值点
传入函数f 搜索区间[a, b]
'''
alpha = 0.381966
beta = 1 - alpha
x1 = a + alpha*(b-a)
x2 = a + b - x1
y1 = f(x1)
y2 = f(x2)
while maxiters:
if y1 > y2:
a = x1
if b - a < acy:
break
else:
x1 = x2
y1 = y2
x2 = alpha*a + beta*b
y2 = f(x2)
maxiters -= 1
continue
else:
b = x2
if b - a < acy:
break
else:
x2 = x1
y2 = y1
x1 = beta*a + alpha*b
y1 = f(x1)
maxiters -= 1
continue
else:
print('黄金分割法未收敛到精度')
return (a+b)/2
def one_dimensional_search(f,x0=0,method='golden_section',acy=0.001,maxiters=1000,h=5):
'''
一维搜索算法
method可选择如下:
golden_section
'''
a,b = success_failure_method(f,x0=x0,h=h,maxiters=maxiters)
if method == 'golden_section':
ans = golden_section_method(f,a,b,acy=acy,maxiters=maxiters)
else:
print('Input method does not exist!')
return None
return ans
def univarite_search_technique(f,d,ini_point=None,acy=0.01,maxiters=1000):
'''
坐标轮换法
传入函数 维度 初始点
'''
if ini_point is None:
point = [0 for i in range(d)]
else:
point = list(ini_point)
difference = 1000
old = f(*point)
while abs(difference) > acy and maxiters:
for i in range(d):
part1 = point[:i]
part2 = point[i+1:]
fun = lambda x:f(*part1,x,*part2)
t = one_dimensional_search(fun,point[i])
point[i] = t
new = f(*point)
difference = new - old
old = new
maxiters -= 1
return point
def logistic_regression(X,Y):
'''
逻辑回归 (对数几率回归)
要求样本数大于维数
Y可以是一维数组或二维数组(1为正例 0为反例)
返回 array[w1, w2, ... , wd, b]
'''
X = np.mat(X)
Y = np.mat(Y)
m = len(X)
if len(Y) == 1:
Y = Y.T
X = np.c_[X,np.ones(m)]
def fun(*w):
nonlocal X,Y
w = np.matrix(w)
return np.sum(-np.multiply(Y.T,w*X.T)+log(1+exp(w*X.T)))
beta = univarite_search_technique(fun,X.shape[1])
return beta
if __name__ == '__main__':
from numpy import random
w = [random.randint(1,10) for i in range(5)]
b = random.randint(-5,5)
print('real w&b')
print(*w,b,sep='\t')
X = []
Y = []
for i in range(20):
X.append([])
Y.append(b+random.random()*4-2)
for j in range(5):
X[-1].append(random.random()*20-10)
Y[-1] += w[j]*X[-1][-1]
if Y[-1] > 0:
Y[-1] = 1
else:
Y[-1] = 0
wp = logistic_regression(X,Y)
wp = list(map(lambda x:round(x,2),wp))
print('predict w&b')
print(*wp,sep='\t')
For the test results, compare against the logic used to generate the data: if the fitted parameters are roughly proportional to the true ones, the algorithm is working (on separable data logistic regression only pins down the direction of (w, b), not its scale).
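If you also want class predictions from the fitted β, a minimal sketch (my own addition, not part of the code above) is:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def logistic_predict(beta, X, threshold=0.5):
    '''beta: array [w1, ..., wd, b] as returned by logistic_regression; X: (m, d) samples'''
    X = np.asarray(X, dtype=float)
    X_hat = np.c_[X, np.ones(len(X))]        # append the constant column, same as in training
    p = sigmoid(X_hat @ np.asarray(beta))    # P(y = 1 | x)
    return (p >= threshold).astype(int)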
LDA (Linear Discriminant Analysis)
For this part I need not only the parameter estimate (the projection direction) but also prediction, so I first wrote a function that just computes the projection direction, and then wrapped it into an LDA class.
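The direction computed below is the textbook closed form: with $S_w$ the within-class scatter matrix,
$$S_w=\sum_{x\in X_0}(x-\mu_0)(x-\mu_0)^{\mathrm T}+\sum_{x\in X_1}(x-\mu_1)(x-\mu_1)^{\mathrm T},\qquad w=S_w^{-1}(\mu_0-\mu_1).$$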
Function That Only Computes the Projection Direction
import numpy as np
def linear_discriminant_analysis(X,Y):
'''
LDA线性判别分析
要求样本数大于维数
Y可以是一维数组或二维数组(1为正例 0为反例)
返回 array[w1, w2, ... , wd]
'''
n0 = []
n1 = []
X = np.mat(X)
Y = np.mat(Y)
m = len(X)
if len(Y) == 1:
Y = Y.T
for i in range(m):
if not int(Y[i]):
n0.append(i)
else:
n1.append(i)
X0 = X[n0]
X1 = X[n1]
miu0 = np.mean(X0,0)
miu1 = np.mean(X1,0)
w_scatter = 0
for i in range(len(X0)):
w_scatter += (X0[i]-miu0).T*(X0[i]-miu0)
for i in range(len(X1)):
w_scatter += (X1[i]-miu1).T*(X1[i]-miu1)
w = np.linalg.inv(w_scatter)*(miu0-miu1).T
w = np.hstack(np.array(w))
return w
if __name__ == '__main__':
from numpy import random
w = [random.randint(1,10) for i in range(5)]
b = random.randint(-5,5)
print('real w&b')
print(*w,b,sep='\t')
X = []
Y = []
for i in range(20):
X.append([])
Y.append(b+random.random()*4-2)
for j in range(5):
X[-1].append(random.random()*20-10)
Y[-1] += w[j]*X[-1][-1]
if Y[-1] > 0:
Y[-1] = 1
else:
Y[-1] = 0
wp = linear_discriminant_analysis(X,Y)
wp = list(map(lambda x:round(x,2),wp))
print('predict w')
print(*wp,sep='\t')
LDA Class
import numpy as np
class Linear_Discriminant_Analysis:
def __init__(self):
pass
def train(self,X,Y):
n0 = []
n1 = []
X = np.mat(X)
Y = np.mat(Y)
m = len(X)
if len(Y) == 1:
Y = Y.T
for i in range(m):
if not int(Y[i]):
n0.append(i)
else:
n1.append(i)
X0 = X[n0]
X1 = X[n1]
miu0 = np.mean(X0,0)
miu1 = np.mean(X1,0)
w_scatter = 0
for i in range(len(X0)):
w_scatter += (X0[i]-miu0).T*(X0[i]-miu0)
for i in range(len(X1)):
w_scatter += (X1[i]-miu1).T*(X1[i]-miu1)
self.w = np.linalg.inv(w_scatter)*(miu0-miu1).T
self.d0 = self.w.T*miu0.T
self.d1 = self.w.T*miu1.T
def predict(self,x):
x = np.mat(x)
if len(x) == 1:
x = x.T
d = self.w.T*x
if abs(d-self.d0) < abs(d-self.d1):
return 0
else:
return 1
if __name__ == '__main__':
from numpy import random
w = [random.randint(1,10) for i in range(5)]
b = random.randint(-5,5)
X = []
Y = []
for i in range(20):
X.append([])
Y.append(b+random.random()*4-2)
for j in range(5):
X[-1].append(random.random()*20-10)
Y[-1] += w[j]*X[-1][-1]
if Y[-1] > 0:
Y[-1] = 1
else:
Y[-1] = 0
model = Linear_Discriminant_Analysis()
model.train(X,Y)
print('real:')
print(*Y,sep=', ')
print('predict:')
for x in X:
y = model.predict(x)
print(y,end=', ')
print()
Decision Trees
ID3
Written for Python 3.7 (a fatal bug was fixed on 2022/1/4 22:22)
# -*- coding: utf-8 -*-
from math import log
import pandas as pd
class DecisionNode:
'''
若不为叶子结点
attribute: 当前判断属性
values: 属性所有可选值[value1,value2...]
若为叶子结点
attribute: 标签名
values: 该分支对应标签值value
'''
def __init__(self, attribute=None, values=None, isleaf=False):
self.a = attribute
self.values = values
self.isleaf = isleaf
class DecisionTree:
def __init__(self, current=None):
self.current = current
self.nextTrees = {}
def addnextTree(self, value, nextTree):
self.nextTrees[value] = nextTree
class ID3:
def __init__(self):
'''
创建ID3学习器
'''
def train(self, D):
'''
D为DF对象
'''
self.decision_tree = TreeGenerate(D)
def predict(self, D, inputtype='dict'):
'''
D为DF对象或字典
'''
if inputtype == 'dict':
testDict = D.copy()
return findvalue(testDict, self.decision_tree)
def show(self):
printTree(self.decision_tree)
def TreeGenerate(D):
tag = D.columns[-1]
if isAllSame(D[tag]):
# 标签值全相同
node = DecisionNode(tag, D.loc[0][tag], isleaf=True)
return DecisionTree(node)
elif len(D.columns) == 1:
# 属性集全空 正常结束
value = getCountMax(D[tag])
node = DecisionNode(tag, value, isleaf=True)
return DecisionTree(node)
else:
default = True
for a in D.columns[:-1]:
if not isAllSame(D[a]):
default = False
break
if default:
# 属性值全相同
value = getCountMax(D[tag])
node = DecisionNode(tag, value, isleaf=True)
return DecisionTree(node)
else:
a = choose(D)
values = []
subsets = division(D, a)
for key in subsets:
values.append(key)
node = DecisionNode(a, values.copy())
tree = DecisionTree(node)
for key in subsets:
tree.addnextTree(key, TreeGenerate(subsets[key]))
return tree
def findvalue(target, tree):
'''
target 为一个字典
'''
if tree.current.isleaf:
return tree.current.values
elif not target:
print('无法判断')
return 0
else:
if tree.current.a in target:
value = target[tree.current.a]
else:
print('预测对象传入属性缺失')
return 0
del target[tree.current.a]
nexttree = tree.nextTrees[value]
return findvalue(target, nexttree)
def printTree(tree,startstr=''):
s1 = str(tree.current.a)
if tree.current.isleaf:
print(startstr+s1+':'+str(tree.current.values))
else:
for value in tree.nextTrees:
print(startstr + s1 + ':' + str(value))
printTree(tree.nextTrees[value],startstr+'|\t')
print(startstr)
def Ent(D):
'''
传入一个DF对象 最后一列为标签列 返回其信息熵
'''
ent = 0
tmp = {}
s = 0
tag = D.columns[-1]
for d in D[tag]:
if d in tmp:
tmp[d] += 1
else:
tmp[d] = 1
s += 1
for key in tmp:
ent -= tmp[key] / s * log(tmp[key] / s, 2)
return ent
def division(D, a):
'''
传入DF对象和作为分割条件的属性 返回分割后的DF列表
'''
order = {} # a的各个属性值对应子DF
columns_new = list(D.columns)
columns_new.remove(a)
for i in range(len(D)):
if D.loc[i][a] in order:
order[D.loc[i][a]].append(i)
else:
order[D.loc[i][a]] = [i]
for key in order:
order[key] = D.loc[order[key], columns_new]
# 这一步一定要重置DF的索引
order[key].reset_index(inplace=True,drop=True)
return order
def Gain(D, a):
'''
传入一个DF对象 最后一列为标签列 返回其信息增益
'''
gain = Ent(D)
subsets = division(D, a)
for key in subsets:
subset = subsets[key]
gain -= len(subset) / len(D) * Ent(subset)
return gain
def choose(D):
'''
选择信息增益最高的属性返回
'''
maxgain = -1
for a in D.columns[:-1]:
gain = Gain(D, a)
if gain > maxgain:
maxgain = gain
ans = a
return ans
def isAllSame(L):
tmp = []
for x in L:
if x not in tmp:
tmp.append(x)
if len(tmp) == 1:
return True
else:
return False
def getCountMax(L):
tmp = {}
count = 0
for x in L:
if x in tmp:
tmp[x] += 1
else:
tmp[x] = 1
for x in tmp:
if tmp[x] > count:
count = tmp[x]
ans = x
return ans
def readexcel(filepath):
'''
要求传入的xlsx文件有表头(属性名) 无索引列
'''
df = pd.read_excel(filepath)
return df
def readcsv(filepath, sep=','):
'''
要求传入的csv文件有表头(属性名) 无索引列
df.columns[:-1] 返回其属性(不含标签)索引列表
'''
df = pd.read_csv(filepath, sep=sep)
return df
if __name__ == '__main__':
df = readcsv('wm2.0.csv')
model = ID3()
model.train(df)
test = {'色泽':'青绿','根蒂':'稍蜷','敲声':'浊响',
'纹理':'稍糊','脐部':'凹陷','触感':'硬滑'}
model.show()
ans = model.predict(test)
print('-------以下为预测部分-------')
print(ans)
The file wm2.0.csv is as follows
色泽,根蒂,敲声,纹理,脐部,触感,好瓜
青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
青绿,稍蜷,浊响,清晰,稍凹,软粘,是
乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
青绿,硬挺,清脆,清晰,平坦,软粘,否
浅白,硬挺,清脆,模糊,平坦,硬滑,否
浅白,蜷缩,浊响,模糊,平坦,软粘,否
青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
浅白,蜷缩,浊响,模糊,平坦,硬滑,否
青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否
Neural Networks
import numpy as np
Perceptron
The basic unit of a neural network is the perceptron, which consists of just two layers of neurons, an input layer and an output layer.
class Perceptron:
def __init__(self, x_dimension, learning_rate=0.1):
self.weight = np.ones(x_dimension+1)
self.eta = learning_rate
def set_learning_rate(self, learning_rate=0.1):
self.eta = learning_rate
def update(self,x,y):
x = np.append(x,[-1])
y_ = sigmoid(np.sum(x*self.weight))
self.weight += self.eta*(y-y_)*x
def output(self,x):
x = np.append(x,[-1])
y_ = sigmoid(np.sum(x*self.weight))
return y_
def sigmoid(x):
return 1/(1+np.exp(-x))
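A quick usage sketch of my own (not from the original code), training the perceptron above on the linearly separable AND problem:
if __name__ == '__main__':
    p = Perceptron(2, learning_rate=0.5)
    data_x = [np.array([0,0]), np.array([0,1]), np.array([1,0]), np.array([1,1])]
    data_y = [0, 0, 0, 1]                   # logical AND is linearly separable
    for epoch in range(2000):               # repeat the single-sample update many times
        for x, y in zip(data_x, data_y):
            p.update(x, y)
    for x in data_x:
        print(x, round(p.output(x), 3))     # outputs should move towards 0, 0, 0, 1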
Multi-Layer Feedforward Network (Standard BP)
Testing shows that a multi-layer network can indeed learn XOR, but success depends on the initial weights and thresholds chosen when the network is created; otherwise it gets stuck in a local minimum, and other measures are needed.
The code below has to be run several times before training succeeds, so, just as the book says, it should really be combined with heuristic methods (simulated annealing, genetic algorithms) or with some randomness injected into the weight-update step.
I also suspect that increasing the number of hidden layers might raise the probability of successful training, but I have neither proved nor tested this.
class Perceptron:
def __init__(self, x_dimension, learning_rate=0.1, israndom=True):
if israndom:
self.weight = np.random.random(x_dimension)*4-2
self.weight = np.append(self.weight,np.random.random(1)*2)
else:
self.weight = np.ones(x_dimension+1)
self.eta = learning_rate
def set_learning_rate(self, learning_rate=0.1):
self.eta = learning_rate
def set_weight(self,w):
self.weight = w
def get_weight(self):
return self.weight
def output(self,x):
x = np.append(x,[-1])
y_ = sigmoid(np.sum(x*self.weight))
return y_
class FNN:
'''
使用标准BP算法训练的多层前馈神经网络
'''
def __init__(self,input_size,hidden_size,output_size,eta=0.1):
self.hidden_layer = [Perceptron(input_size) for i in range(hidden_size)]
self.output_layer = [Perceptron(hidden_size) for i in range(output_size)]
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.eta = eta
def predict(self,x):
'''
x type: np.array
'''
b = []
for p in self.hidden_layer:
b.append(p.output(x))
b = np.array(b)
y = []
for p in self.output_layer:
y.append(p.output(b))
y = np.array(y)
return y
def update_by_step(self,x,y):
'''
输入单个 x-y 进行更新
'''
b = []
for p in self.hidden_layer:
b.append(p.output(x))
b = np.array(b)
y_ = []
for p in self.output_layer:
y_.append(p.output(b))
y_ = np.array(y_) # y的预测值
g = y_*(1-y_)*(y-y_)
w_output = [] # 用来保存输出层每个M-P神经元的权重参数
alter_w_output = []
for j in range(self.output_size):
w_output.append(self.output_layer[j].get_weight())
alter_w_output.append(self.output_layer[j].get_weight())
alter_w_output[j] = alter_w_output[j] + self.eta*g[j]*np.append(b,[-1])  # update the j-th output neuron's weights (threshold included)
self.output_layer[j].set_weight(alter_w_output[j])
w_output = np.array(w_output)
e = b*(1-b)*np.dot(g,w_output[:,:-1])  # e_h = b_h(1-b_h)*sum_j w_hj*g_j, excluding the threshold column
alter_w_hidden = []
for i in range(self.hidden_size):
alter_w_hidden.append(self.hidden_layer[i].get_weight())
alter_w_hidden[-1] += self.eta*e[i]*np.append(x,[-1])
self.hidden_layer[i].set_weight(alter_w_hidden[i])
def sigmoid(x):
return 1/(1+np.exp(-x))
if __name__ == '__main__':
network = FNN(2,2,1)
data_x = [[1,1],[1,0],[0,1],[0,0]]
data_y = [1,0,0,1]
for i in range(1000):
for j in range(4):
network.update_by_step(data_x[j],data_y[j])
for i in range(4):
print(network.predict(data_x[i]))
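Since a single run easily stalls in a poor local minimum, one simple remedy (a sketch of my own, reusing the FNN class above) is to restart training several times from fresh random weights and keep the best network:
def train_with_restarts(data_x, data_y, restarts=10, epochs=1000):
    '''Train several randomly initialized FNNs and keep the one with the lowest squared error.'''
    best_net, best_err = None, float('inf')
    for r in range(restarts):
        net = FNN(2, 2, 1)                      # fresh random initialization on every restart
        for epoch in range(epochs):
            for x, y in zip(data_x, data_y):
                net.update_by_step(np.array(x), y)
        err = sum((net.predict(np.array(x))[0] - y)**2 for x, y in zip(data_x, data_y))
        if err < best_err:
            best_net, best_err = net, err
    return best_net, best_err

# e.g. net, err = train_with_restarts([[1,1],[1,0],[0,1],[0,0]], [1,0,0,1])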
Support Vector Machines
Clearing Up Some Questions from the Book
- In Eq. (6.3) the book assumes that the support vectors satisfy $|w^{\mathrm T}x+b|=1$; this is just a rescaling chosen to make the later derivation convenient, and the side note in the book mentions that the rescaling step is omitted.
- Eq. (6.11) is obtained by first deriving the stationarity conditions $w=\sum_i\alpha_iy_ix_i$ (Eq. 6.9) and $\sum_i\alpha_iy_i=0$, and then substituting Eq. (6.9) back into the Lagrangian.
- After SMO has produced the $\alpha_i$, substituting them back into Eq. (6.9) gives $w$.
The SMO Algorithm
Deriving the Closed-Form Solution
First let me derive the closed-form solution mentioned in the book's description of SMO; let m denote the number of samples.
Suppose the two variables selected for updating are $\alpha_i$ and $\alpha_j$, and write the equality constraint as $\alpha_iy_i+\alpha_jy_j=-\sum_{k\neq i,j}\alpha_ky_k$.
Dropping the constant part of the objective of the dual problem given in the book leaves a two-variable subproblem.
Using the Lagrange multiplier method and differentiating with respect to $\alpha_i$ and $\alpha_j$ gives two stationarity equations;
after eliminating the multiplier,
substituting the constraint back in finally yields closed-form expressions for $\alpha_i$ and $\alpha_j$.
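For reference, the standard form of this two-variable subproblem (written here with plain inner products $K_{pq}=x_p^{\mathrm T}x_q$; the notation differs a little from my derivation above) is
$$\max_{\alpha_i,\alpha_j}\ \alpha_i+\alpha_j-\frac12K_{ii}\alpha_i^2-\frac12K_{jj}\alpha_j^2-y_iy_jK_{ij}\alpha_i\alpha_j-y_i\alpha_i\sum_{k\neq i,j}\alpha_ky_kK_{ik}-y_j\alpha_j\sum_{k\neq i,j}\alpha_ky_kK_{jk}$$
$$\text{s.t.}\quad \alpha_iy_i+\alpha_jy_j=-\sum_{k\neq i,j}\alpha_ky_k,\qquad \alpha_i\ge 0,\ \alpha_j\ge 0 .$$
Using the equality constraint to express one variable through the other and setting the derivative to zero gives the closed-form update; in the hard-margin case the result only needs to be clipped at $\alpha\ge 0$.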
Designing the Iteration Loop
Damn it. When working out the closed-form solution above I dropped the minus sign at the front of the numerator, which cost me more than ten hours of "improving" my algorithm. After the algorithm had been tuned to near perfection it still produced errors; I then spent several more hours proving mathematically that the book's algorithm does reach the optimum, and finally a few more hours working backwards from the wrong results until I spotted the sign error in my formula. Let me first describe the improvements I made:
- Designed an initialization step that is relatively cheap and works well.
- Dropped the book's way of updating the bias b and used a new one, so that a reasonable bias can still be computed even when the support vectors cannot be identified.
- For picking the two variables in SMO, designed a fairly efficient scheme based on the book's idea.
- The current algorithm does not support kernels or soft margins yet, but I think both are easy to add; I'll do that later (doge).
Hard-Margin SVM
I won't write a predict method for now; once $w$ and $b$ are known, prediction is trivial anyway (a small sketch is given after the code).
import numpy as np
import matplotlib.pyplot as plt
class SVM:
def __init__(self,x_dimension):
self.w = np.zeros(x_dimension)
self.b = 0
def train(self,X,Y,acy=0.01,maxiters=1000):
'''
X type: list with np.array
Y type: list with value(1 or -1)
'''
# step1 初始化
m = len(Y)
for i in range(m):
self.w += X[i]*Y[i]
self.w = self.w/m
# 本人决定在初始化b多花点时间
self.initialize_b(X,Y)
# test!
print('初始化w,b为:',end=' ')
print(self.w,self.b)
#
self.alphas = [1/m for i in range(m)]
self.supports = []
# step2 SMO
while maxiters:
oldw = self.w
oldb = self.b
self.sequential_minimal_optimization(X,Y)
alter = np.sum(np.abs(self.w-oldw))+abs(self.b-oldb)
if alter < acy:
break
maxiters -= 1
else:
print('Train Failed')
if np.sum(self.alphas) == 0:
print('Train Failed')
else:
print('Train Success')
def initialize_b(self,X,Y):
m = len(Y)
set1 = []
set2 = []
for i in range(m):
if Y[i] == 1:
set1.append(np.sum(self.w*X[i]))
else:
set2.append(np.sum(self.w*X[i]))
margin = [min(set1),max(set1),min(set2),max(set2)]
margin.sort()
self.b = -(margin[1]+margin[2])/2
def sequential_minimal_optimization(self,X,Y):
zero = 0.000001
m = len(Y)
# step1 确定第一个变量更新顺序
outcirculation = []
other1 = []
other2 = []
other3 = []
fun = [Y[k]*(np.sum(self.w*X[k])-self.b)-1 for k in range(m)]
for i in range(m):
if self.alphas[i] > zero:
if fun[i] < -zero:
outcirculation.append(i)
elif fun[i] > zero:
other1.append(i)
else:
other3.append(i)
else:
if fun[i] < -zero:
other2.append(i)
else:
other3.append(i)
outcirculation.sort(key=lambda x:fun[x])
other1.sort(key=lambda x:fun[x],reverse=True)
other2.sort(key=lambda x:fun[x])
outcirculation += other1
outcirculation += other2
#outcirculation += other3
del other1,other2,other3,fun
# step2-4 循环
incirculation = list(range(m))
for first in outcirculation:
# step2 确定与第一个变量对应的第二个变量
fun = [np.sum(self.w*X[k]) for k in range(m)]
maxvalue = 0
second = first
for i in incirculation:
value = abs(fun[i]-fun[first])
if Y[first]*Y[i] < 0 and value > maxvalue:
maxvalue = value
second = i
# test!
'''print('选择的两个更新目标为:',end=' ')
print(first,second)'''
#
# step3 计算更新alpha
if first != second:
new1,new2 = self.update_alpha(X,Y,first,second)
self.alphas[first] = new1
self.alphas[second] = new2
# step4 更新w
neww = 0
newsupport = list(range(m))
for i in range(m):
neww = neww + self.alphas[i]*Y[i]*X[i]
if self.alphas[i] == 0:
newsupport.remove(i)
self.w = neww
# step5 更新b
self.initialize_b(X,Y)
# test!
print('更新后w,b为:',end=' ')
print(self.w,self.b)
#print(self.alphas)
#
def update_alpha(self,X,Y,i,j):
'''
i,j from 0, 1, ... , m-1
'''
Q = list(range(len(Y)))
Q.remove(i)
Q.remove(j)
q1 = Y[i]-Y[j]
c1 = (X[i]-X[j])*Y[i]*Y[j]
c2 = 0
c3 = 0
for k in Q:
c2 = c2 + self.alphas[k]*Y[k]*X[k]
c3 = c3 - self.alphas[k]*Y[k]
q2 = np.sum(c1*c2)
q3 = c1*c3
p1 = np.sum(c1*(X[i]-X[j]))
alpha1 = max((-q1-q2-np.sum(q3*X[j]))/Y[i]/p1,0)
alpha2 = max((-q2-q1+np.sum(q3*X[i]))/Y[j]/p1,0)
return alpha1,alpha2
if __name__ == '__main__':
data_x = []
data_y = []
w1 = 0.6
w2 = 0.4
for i in range(50):
x1 = np.random.randint(-20,40)
x2 = np.random.randint(0,60)
f = w1*x1+w2*x2
if f > 20:
data_x.append(np.array([x1,x2]))
data_y.append(1)
elif f < 16:
data_x.append(np.array([x1,x2]))
data_y.append(-1)
model = SVM(2)
model.train(data_x,data_y,maxiters=100)
fig = plt.figure()
for i in range(len(data_x)):
if data_y[i] == 1:
plt.scatter(*data_x[i],c='r')
else:
plt.scatter(*data_x[i],c='g')
xx = np.linspace(-20,40,1000)
yy = (-model.b-model.w[0]*xx)/model.w[1]
plt.plot(xx,yy)
plt.show()
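As promised above, a minimal predict sketch of my own for the hard-margin SVM; it follows the convention used in the plotting code, where the separating hyperplane is $w^{\mathrm T}x+b=0$:
def svm_predict(model, x):
    '''Predict the class (+1 / -1) of a single sample with a trained SVM instance.'''
    x = np.asarray(x, dtype=float)
    return 1 if np.sum(model.w*x) + model.b >= 0 else -1

# e.g. print(svm_predict(model, [30, 40]))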
Kernel Function Idea
Let me just write down the idea for implementing kernel functions here and fill in the code some other time.
The basic concept of a kernel is easy to grasp: at the end of the derivation everything reduces to inner products between pairs of vectors, i.e. $x_i^{\mathrm T}x_j$ is generalized to $\kappa(x_i,x_j)$, and the original $x_i^{\mathrm T}x_j$ is then called the linear kernel. When we derived the SMO closed form it was easy to see that these inner products involve only the training samples, so they are just constants; introducing a kernel therefore changes the algorithm very little. Of course, the kernel must be used not only where SMO solves for the two variables, but also when updating the parameters $w$ and $b$.
In my code the linear kernel is simply the elementwise product of two np.array objects summed with np.sum; we can replace those np.sum() inner products in the code with a custom kappa() function and pass an extra argument that selects the kernel.
def kappa(x1,x2,method='line',gamma=1.0,d=2):
    '''
    input type: np.array
    'line' is the linear kernel used so far; 'rbf' (Gaussian) and 'poly' are added here as examples
    '''
    if method == 'line':
        return np.sum(x1*x2)
    elif method == 'rbf':
        return np.exp(-gamma*np.sum((x1-x2)**2))
    elif method == 'poly':
        return np.sum(x1*x2)**d
    else:
        raise ValueError('unknown kernel: %s' % method)
Soft-Margin Idea
Soft margin is probably a bit more troublesome than kernels. Following the book's derivation (which I had not worked through at the time), the end result is the original dual problem plus one extra constraint $\alpha_i\le C$. The KKT conditions also change a little, so quite a few places need modifying: the step that chooses the two variables, extra case checks when solving for the updated variables, and in particular my way of updating $b$, which did not need the support vectors to be identified.
A year later, let me write down the derivation of the soft-margin dual. The soft-margin optimization problem can be written as
$$\min_{w,b,\xi}\ \frac12\|w\|^2+C\sum_{i=1}^m\xi_i\qquad\text{s.t.}\ \ y_i(w^{\mathrm T}x_i+b)\ge 1-\xi_i,\ \ \xi_i\ge 0,\ \ i=1,\dots,m.$$
Writing down the Lagrangian gives an equivalent form of the primal problem,
$$L(w,b,\xi,\alpha,\mu)=\frac12\|w\|^2+C\sum_{i=1}^m\xi_i+\sum_{i=1}^m\alpha_i\bigl(1-\xi_i-y_i(w^{\mathrm T}x_i+b)\bigr)-\sum_{i=1}^m\mu_i\xi_i ,$$
where the multipliers $\alpha_i$ and $\mu_i$ are both non-negative. Setting the derivatives with respect to $w$, $b$ and $\xi_i$ to zero gives
$$w=\sum_{i=1}^m\alpha_iy_ix_i,\qquad \sum_{i=1}^m\alpha_iy_i=0,\qquad C=\alpha_i+\mu_i .$$
Substituting these three expressions back into the Lagrangian gives the objective of the dual problem; the last expression together with the non-negativity of $\alpha_i$ and $\mu_i$ gives the constraint $0\le\alpha_i\le C$, and don't forget that the second expression is also a constraint of the dual.
Written out explicitly, the dual problem is
$$\max_\alpha\ \sum_{i=1}^m\alpha_i-\frac12\sum_{i=1}^m\sum_{j=1}^m\alpha_i\alpha_jy_iy_jx_i^{\mathrm T}x_j\qquad\text{s.t.}\ \ \sum_{i=1}^m\alpha_iy_i=0,\ \ 0\le\alpha_i\le C .$$
As we can see, the soft-margin dual has the same objective as the hard-margin one, just with an extra upper bound on each $\alpha_i$. So the SMO algorithm for the soft margin follows essentially the same outline as the hard-margin version.
Clustering
k-Means
# -*- coding: utf-8 -*-
from math import sqrt
import random
import pandas as pd
import matplotlib.pyplot as plt
class kMeans:
def __init__(self,k=3):
self.k = k
self.clusts = [] # 存储每个簇均值
self.result = [] # 存储当前聚类结果
self.count = 0
def train(self,D,iter_max=100):
self.clusts = initialize(D,self.k)
self.clusts,result = update(self.clusts,D,self.k)
count = 1
for count in range(2,iter_max+1):
self.clusts,result_n = update(self.clusts,D,self.k)
if result_n == result:break
result = result_n
self.result = result
self.count = count
self.data = D
def predict(self,D):
pass
def show(self,is2d=False):
if is2d:
colorlist = ['b','r','g','y','pink','black','darkmagenta']
fig = plt.figure()
for i in range(self.k):
plt.scatter(self.data.loc[self.result[i]][self.data.columns[0]],
self.data.loc[self.result[i]][self.data.columns[1]],
c=colorlist[i])
plt.show()
else:
for clust in self.result:
print(clust)
print('迭代次数:%d'%(self.count))
def dist(x,y,dtype='euclidean'):
'''
计算两向量间的距离 要求x y长度相同
'''
ans = 0
if dtype == 'euclidean':
for i in range(len(x)):
ans += (y[i]-x[i])**2
ans = sqrt(ans)
elif dtype == 'manhattan':
for i in range(len(x)):
ans += abs(y[i]-x[i])
return ans
def initialize(remain,k):
'''
初始化簇 返回初始簇
remain 代表剩余样本 DaraFrame对象
'''
selects = random.sample(range(len(remain)),k)
clusts = [[remain.loc[j][i] for i in remain.columns] for j in selects]
'''
remain = remain.drop(selects)
remain.reset_index(inplace=True,drop=True)
'''
return clusts
def update(clusts,df,k):
'''
传入 当前每个簇的均值向量 和 训练集
返回更新后的均值向量和聚类二维列表
'''
result = [[] for i in range(k)]
new = []
dimension = len(df.columns)
for i in range(len(df)):
n = choose(list(df.loc[i]),clusts,k)
result[n].append(i)
for i in range(k):
new.append([0]*dimension)
for j in range(len(result[i])):
for x in range(dimension):
new[i][x] += df.loc[result[i][j]][df.columns[x]]
if result[i]:
for x in range(dimension):
new[i][x] /= len(result[i])
else:
# empty cluster: keep the previous mean vector instead of dividing by a stale index
new[i] = list(clusts[i])
return new,result
def choose(single,clusts,k):
'''
返回选择的簇在clusts中的序号
'''
mind = dist(single,clusts[0])
n = 0
for i in range(1,k):
d = dist(single,clusts[i])
if d < mind:
mind = d
n = i
return n
if __name__ == '__main__':
df = pd.read_csv('wm.4.0.csv')
model = kMeans(2)
model.train(df)
model.show()
model.show(is2d=True)
The file wm.4.0.csv is as follows
密度,含糖率
0.697,0.460
0.403,0.237
0.245,0.057
0.593,0.042
0.748,0.232
0.751,0.489
0.774,0.376
0.481,0.149
0.343,0.099
0.719,0.103
0.714,0.346
0.532,0.472
0.634,0.264
0.437,0.211
0.639,0.161
0.359,0.188
0.483,0.312
0.473,0.376
0.608,0.318
0.666,0.091
0.657,0.198
0.339,0.241
0.478,0.437
0.725,0.445
0.556,0.215
0.243,0.267
0.360,0.370
0.282,0.257
0.525,0.369
0.446,0.459