Table of Contents
- Data Preparation
- The Three Stages of Support Vector Machines
- Linearly Separable Support Vector Machine
- Linear Support Vector Machine
- Nonlinear Support Vector Machine
- SVM-SMO: Sequential Minimal Optimization
- Summary
- References
Data Preparation
To check the SVM implementation against sklearn, this post uses 100 randomly generated samples, which are easy to plot for comparison. The data lives in the SVM folder: https://github.com/phdsky/xCode/tree/main/机器学习/统计学习方法/svm/dataset.txt
The model implemented here is a linear support vector machine with a soft margin, so a two-class dataset also works: in mnist_binary.csv, the original handwritten digits 0~4 are mapped to the negative class -1 and 5~9 to the positive class +1, matching SVM's y ∈ {-1, +1} label convention. (In the code below the MNIST path is commented out in favor of dataset.txt.)
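For reference, here is a minimal sketch of how such a mnist_binary.csv could be produced from a raw MNIST CSV; the input path and the assumption that the digit label sits in the first column are mine, not from the original code:

```python
import pandas as pd

# Hypothetical input: a raw MNIST CSV whose first column is the digit label.
raw = pd.read_csv("../data/mnist.csv")

# Map digits 0~4 to the negative class (-1) and 5~9 to the positive class (+1),
# which matches the y in {-1, +1} labels the SVM below expects.
raw.iloc[:, 0] = raw.iloc[:, 0].apply(lambda d: -1 if d <= 4 else 1)
raw.to_csv("../data/mnist_binary.csv", index=False)
```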
The Three Stages of Support Vector Machines
The theory behind support vector machines is laid out cleanly and builds up in layers, in three stages: the linearly separable support vector machine, the linear support vector machine, and the nonlinear support vector machine. From shallow to deep; the three sections below are placeholders to be filled in later.
Linearly Separable Support Vector Machine
Linear Support Vector Machine
Nonlinear Support Vector Machine
SVM-SMO: Sequential Minimal Optimization
This post implements and tests the linear support vector machine model; kernel functions are also wired in to cover the nonlinear case, though their results were not tested. The implementation uses the SMO (Sequential Minimal Optimization) algorithm.
In brief, the SMO flow is: repeatedly select a pair of multipliers (α1, α2) that violates the KKT conditions, solve the two-variable quadratic subproblem analytically, clip α2 to its feasible interval [L, H], then update b and the error cache; stop when a full sweep changes no pair.
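As a roadmap for the full class below, here is a sketch of Platt's outer loop, with examine_example as a stub standing in for the per-pair step (which smo_trunk implements below); the function names in this sketch are mine:

```python
def smo_skeleton(X, y, C=1.0, max_iter=50):
    # A sketch of Platt's outer loop, mirroring the train() method below.
    n = len(X)
    alpha, b = [0.0] * n, 0.0

    def examine_example(i1):
        # Stub for the per-pair step: check KKT for i1; if violated, pick i2
        # maximizing |E1 - E2|, solve the two-variable subproblem analytically,
        # clip alpha2 to [L, H], update b and the error cache; return 1 on change.
        return 0

    examine_all, num_changed, it = True, 0, 0
    while it < max_iter and (num_changed > 0 or examine_all):
        it += 1
        num_changed = 0
        # Alternate between a full sweep and a sweep over non-bound points
        # (0 < alpha_i < C), where KKT violations are most likely.
        candidates = [i for i in range(n) if examine_all or 0 < alpha[i] < C]
        for i1 in candidates:
            num_changed += examine_example(i1)
        examine_all = not examine_all
    return alpha, b
```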
The SVM model code based on the SMO algorithm:
```python
# @Author: phd
# @Date: 2019-10-22
# @Site: github.com/phdsky
# @Description: NULL
import time
import logging
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer
def log(func):
def wrapper(*args, **kwargs):
start_time = time.time()
ret = func(*args, **kwargs)
end_time = time.time()
        logging.debug('%s() costs %s seconds' % (func.__name__, end_time - start_time))
return ret
return wrapper
def calc_accuracy(y_pred, y_truth):
assert len(y_pred) == len(y_truth)
n = len(y_pred)
hit_count = 0
for i in range(0, n):
if y_pred[i] == y_truth[i]:
hit_count += 1
print("Predicting accuracy %f" % (hit_count / n))
def sign(x):
    # Map a decision value to a class label; the original third branch was unreachable
    if x >= 0:
        return 1
    return -1
class SVM(object):
def __init__(self, features, labels, kernelType='linear', C=1.0, epsilon=0.001, tolerance=0.001):
self.kernelType = kernelType
        self.C = C  # penalty parameter
self.epsilon = epsilon # slack
self.tolerance = tolerance # tolerance
self.X = features
self.Y = labels
self.numOfSamples = len(self.X)
self.b = 0.
self.alpha = [0.]*self.numOfSamples
# self.Ei = [self.calculate_Ei(i) for i in range(self.numOfSamples)]
self.Ei = [0.]*self.numOfSamples
def kernel(self, X1, X2):
X1 = np.asarray(X1)
X2 = np.asarray(X2)
if self.kernelType == 'linear':
return np.inner(X1, X2)
elif self.kernelType == 'poly':
self.p = 2 # assumed poly value
return (np.inner(X1, X2))**self.p
elif self.kernelType == 'gaussian':
self.sigma = 10 # assumed sigma value
return np.exp(-np.inner((X1 - X2), (X1 - X2)) / (2 * self.sigma**2))
else:
print("WTF kernel type is?")
exit(-1)
def calculate_gxi(self, i):
return self.b + sum([self.alpha[j]*self.Y[j]*self.kernel(self.X[i], self.X[j])
for j in range(self.numOfSamples)])
def calculate_Ei(self, i):
return self.calculate_gxi(i) - self.Y[i]
def is_satisfy_KKT(self, i):
if (self.alpha[i] == 0) and (self.Y[i]*self.calculate_gxi(i) >= 1. - self.epsilon):
return True
elif (0. < self.alpha[i] < self.C) and (np.fabs(self.Y[i]*self.calculate_gxi(i) - 1.) <= self.epsilon):
return True
elif (self.alpha[i] == self.C) and (self.Y[i]*self.calculate_gxi(i) <= 1. + self.epsilon):
return True
return False
def select_two_parameters(self):
# First, select all 0 < alpha < C sample points check these points satisfy KKT or not
# If all these points(0 < alpha < C) satisfy KKT
# Then should check all sample points whether satisfy KKT
# Select one that breaks KKT, and another one has max |E1 - E2| value
allPointsIndex = [i for i in range(self.numOfSamples)]
conditionPointsIndex = list(filter(lambda c: 0 < self.alpha[c] < self.C, allPointsIndex))
unConditionPointsIndex = list(set(allPointsIndex) - set(conditionPointsIndex))
reArrangePointsIndex = conditionPointsIndex + unConditionPointsIndex
for i in reArrangePointsIndex:
if self.is_satisfy_KKT(i):
continue
maxIndexEi = (0, 0.) # (key, value)
E1 = self.Ei[i]
for j in allPointsIndex:
if i == j:
continue
E2 = self.Ei[j]
if np.fabs(E1 - E2) > maxIndexEi[1]:
maxIndexEi = (j, np.fabs(E1 - E2))
return i, maxIndexEi[0]
return 0, 0
def select_i2(self, i1, E1):
E2 = 0
i2 = -1
max_E1_E2 = -1
non_zero_Ei = [ei for ei in range(self.numOfSamples) if self.calculate_Ei(ei) != 0]
for e in non_zero_Ei:
E2_tmp = self.calculate_Ei(e)
if np.fabs(E1 - E2_tmp) > max_E1_E2:
max_E1_E2 = np.fabs(E1 - E2_tmp)
E2 = E2_tmp
i2 = e
if i2 == -1:
i2 = i1
while i2 == i1:
i2 = int(random.uniform(0, self.numOfSamples))
E2 = self.calculate_Ei(i2)
# E2 = self.Ei[i2]
return i2, E2
def smo_trunk(self, i1):
E1 = self.calculate_Ei(i1)
# E1 = self.Ei[i1]
if not self.is_satisfy_KKT(i1):
i2, E2 = self.select_i2(i1, E1)
print(i1, i2)
alpha_i1_old = self.alpha[i1]
alpha_i2_old = self.alpha[i2]
            # Clipping bounds L, H for alpha2 (see the book's SMO section);
            # the original third branch here was unreachable
            if self.Y[i1] != self.Y[i2]:
                L = np.fmax(0., alpha_i2_old - alpha_i1_old)
                H = np.fmin(self.C, self.C + alpha_i2_old - alpha_i1_old)
            else:  # self.Y[i1] == self.Y[i2]
                L = np.fmax(0., alpha_i2_old + alpha_i1_old - self.C)
                H = np.fmin(self.C, alpha_i2_old + alpha_i1_old)
if L == H:
return 0
eta = (self.kernel(self.X[i1], self.X[i1]) + self.kernel(self.X[i2], self.X[i2])) - \
(self.kernel(self.X[i1], self.X[i2]) * 2.)
if eta <= 0:
return 0
alpha2_new_unclipped = alpha_i2_old + (self.Y[i2]*(E1 - E2) / eta)
if alpha2_new_unclipped >= H:
alpha2_new_clipped = H
elif L < alpha2_new_unclipped < H:
alpha2_new_clipped = alpha2_new_unclipped
elif alpha2_new_unclipped <= L:
alpha2_new_clipped = L
else:
print("WTF of the alpha2_new_uncliped value?")
print(i1, i2, alpha2_new_unclipped, eta)
exit(-1)
if np.fabs(alpha2_new_clipped - alpha_i2_old) < self.tolerance:
return 0
s = self.Y[i1]*self.Y[i2]
alpha1_new = alpha_i1_old + s*(alpha_i2_old - alpha2_new_clipped)
b1 = - E1 \
- self.Y[i1]*self.kernel(self.X[i1], self.X[i1])*(alpha1_new - alpha_i1_old) \
- self.Y[i2]*self.kernel(self.X[i2], self.X[i1])*(alpha2_new_clipped - alpha_i2_old)\
+ self.b
b2 = - E2 \
- self.Y[i1]*self.kernel(self.X[i1], self.X[i2])*(alpha1_new - alpha_i1_old) \
- self.Y[i2]*self.kernel(self.X[i2], self.X[i2])*(alpha2_new_clipped - alpha_i2_old) \
+ self.b
if 0 < alpha1_new < self.C:
b = b1
elif 0 < alpha2_new_clipped < self.C:
b = b2
else:
b = (b1 + b2) / 2.
self.b = b
self.alpha[i1] = alpha1_new
self.alpha[i2] = alpha2_new_clipped
# Update all error cache Ei value
self.Ei = [self.calculate_Ei(i) for i in range(self.numOfSamples)]
# self.Ei[i1] = self.calculate_Ei(i1)
# self.Ei[i2] = self.calculate_Ei(i2)
return 1
else:
return 0
def check_not_bound(self):
return [nb for nb in range(self.numOfSamples) if 0 < self.alpha[nb] < self.C]
@log
def train(self, maxIteration=50):
iterNum = 0
iterEntireSet = True
alphaPairsChanged = 0
while (iterNum < maxIteration) and (alphaPairsChanged > 0 or iterEntireSet):
iterNum += 1
print("Iteration: %d of %d" % (iterNum, maxIteration))
alphaPairsChanged = 0
if iterEntireSet:
for i in range(self.numOfSamples):
alphaPairsChanged += self.smo_trunk(i)
else:
not_bound_list = self.check_not_bound()
for i in not_bound_list:
alphaPairsChanged += self.smo_trunk(i)
if iterEntireSet:
iterEntireSet = False
else:
iterEntireSet = True
@log
def predict(self, X_test):
n = len(X_test)
predict_label = np.full(n, -2)
for i in range(0, n):
to_predict = X_test[i]
result = self.b
for j in range(self.numOfSamples):
result += self.alpha[j]*self.Y[j]*self.kernel(to_predict, self.X[j])
predict_label[i] = sign(result)
return predict_label
def visualize(self):
# Extract positive & negative samples
# conditionPointsIndex = list(filter(lambda c: 0 < self.alpha[c] < self.C, allPointsIndex))
positive_index = [pos for pos in range(self.numOfSamples) if self.Y[pos] == 1]
negative_index = [neg for neg in range(self.numOfSamples) if self.Y[neg] == -1]
plt.xlabel('X1') # -12 ~ 12
plt.ylabel('X2') # -6 ~ 6
positive = np.asarray(list(self.X[pos] for pos in positive_index))
negative = np.asarray(list(self.X[neg] for neg in negative_index))
plt.scatter(positive[:, 0], positive[:, 1], c='b', s=20)
plt.scatter(negative[:, 0], negative[:, 1], c='y', s=20)
for i, text in enumerate(np.arange(self.numOfSamples)):
plt.annotate(text, (self.X[i][0], self.X[i][1]))
support_index = [sv for sv in range(self.numOfSamples) if self.alpha[sv] != 0]
support_alpha = np.asarray(list(self.alpha[sv] for sv in support_index))
support_vector = np.asarray(list(self.X[sv] for sv in support_index))
support_label = np.asarray(list(self.Y[sv] for sv in support_index))
        # facecolors='none' draws hollow markers; c='' breaks on newer matplotlib
        plt.scatter(support_vector[:, 0], support_vector[:, 1], facecolors='none', s=100, alpha=0.5, linewidths=1.5, edgecolor='red')
print("Support Vector Number: ", len(support_vector))
print(support_index)
X1 = np.arange(np.min([self.X[x][0] for x in range(self.numOfSamples)]),
np.max([self.X[x][0] for x in range(self.numOfSamples)]),
0.1)
X2 = np.arange(np.min([self.X[x][1] for x in range(self.numOfSamples)]),
np.max([self.X[x][1] for x in range(self.numOfSamples)]),
0.1)
x1, x2 = np.meshgrid(X1, X2)
g = self.b
for i in range(len(support_vector)):
if self.kernelType == 'linear':
g += support_alpha[i]*support_label[i]*(x1*support_vector[i][0] + x2*support_vector[i][1])
elif self.kernelType == 'poly':
print("Not Implement Yet...")
elif self.kernelType == 'gaussian':
g += support_alpha[i]*support_label[i]*\
np.exp(-0.5*((x1 - support_vector[i][0])**2 + (x2 - support_vector[i][1])**2) / (self.sigma**2))
else:
print("WTF kernel type is?")
exit(-1)
plt.contour(x1, x2, g, 0, colors='c')
plt.show()
print("Figure plot!")
def load_dataset(filename):
data = []
label = []
with open(filename) as fr:
for line in fr.readlines():
line = line.strip().split(' ')
data.append([float(line[0]), float(line[1])])
label.append(float(line[2]))
return data, label
if __name__ == "__main__":
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)
# mnist_data = pd.read_csv("../data/mnist_binary.csv")
# mnist_values = mnist_data.values
#
# sample_num = 2000
# images = mnist_values[:sample_num, 1::]
# labels = mnist_values[:sample_num, 0]
#
# X_train, X_test, y_train, y_test = train_test_split(
# images, labels, test_size=0.33, random_state=42
# )
X_train, y_train = load_dataset('./dataset.txt')
svm = SVM(features=X_train, labels=y_train, kernelType='linear')
print("SVM training...")
svm.train()
print("\nTraining done...")
svm.visualize()
print(np.nonzero(svm.alpha), svm.b)
# print("Testing on %d samples..." % len(X_test))
# y_predicted = svm.predict(X_test=X_test)
    # calc_accuracy(y_pred=y_predicted, y_truth=y_test)
```
Output:
```
SVM training...
Iteration: 1 of 50
0 2
3 55
7 75
8 42
10 42
11 75
14 75
17 42
21 75
23 42
24 75
27 75
29 42
30 42
35 75
37 75
52 42
53 75
65 75
70 75
76 75
84 75
94 75
95 75
96 75
97 42
Iteration: 2 of 50
0 75
2 42
Iteration: 3 of 50
0 75
2 42
7 75
8 42
10 42
11 75
14 75
17 42
21 75
23 42
24 75
27 75
29 42
30 42
35 75
37 75
52 42
53 75
65 75
70 75
76 75
84 75
94 75
95 75
96 75
97 42
Training done...
Support Vector Number: 4
[0, 2, 3, 55]
Figure plot!
(array([ 0, 2, 3, 55]),) -2.117589887010262
```
A few points in the implementation deserve attention:
- The SMO procedure as narrated in *Statistical Learning Methods* (《统计学习方法》) differs slightly from the flow in the original SMO paper, "Sequential Minimal Optimization: A Fast Algorithm for Training Support Vector Machines", and the paper also gives pseudocode for the algorithm;
- In the code above, select_two_parameters corresponds to the book's variable-selection method, while smo_trunk corresponds to the paper's examineExample function; the final implementation follows the paper's flow;
- Neither the book nor the paper gives an explicit formula for checking the KKT conditions with slack variables. I have looked at many creative implementations of this check, with the slack term and tolerance written on either side of the inequality; I suspect this is the main source of the support-vector-selection bug, and the canonical answer is to read how libsvm implements it (see the first sketch after this list);
- At the end of its SMO section, the book notes that after each optimization of two variables, the corresponding $E_i$ values must be updated and saved in a list. Updating $E_i$ uses the new $b$ value and the $\alpha_j$ of all support vectors:

  $$E_i^{new} = \sum_{j \in S} y_j \alpha_j K(x_j, x_i) + b^{new} - y_i$$

  where $S$ is the set of all support vectors $x_j$. The book is not very clear about *which* $E_i$ to update: only the $E_1$ and $E_2$ of the two selected variables, or all of them. My view is that since the model's $b$ has just been updated, all $E_i$ should be refreshed; many of those creative implementations update only $E_1$ and $E_2$ (see the second sketch after this list);
- Judging from the result figure above, there is clearly a bug in support-vector selection! For a standard baseline, I ran the same data through sklearn below, got comparison results, and plotted the corresponding classification figure. Besides the two ambiguous points above, other bugs may lurk in the code; I have no time to chase them for now, so next time I should debug the support-vector-selection error on an even smaller dataset.
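On the KKT-check point: a minimal sketch of the violation test from Platt's examineExample pseudocode, written on the cached error $E_i = g(x_i) - y_i$ rather than on $y_i \, g(x_i)$ directly; the function name and the tol default are my labels, not from the code above:

```python
def violates_kkt(alpha_i, y_i, E_i, C, tol=1e-3):
    # r_i = y_i * E_i = y_i * g(x_i) - 1, so:
    #   r_i < -tol  means y_i * g(x_i) < 1 - tol, only allowed when alpha_i == C;
    #   r_i >  tol  means y_i * g(x_i) > 1 + tol, only allowed when alpha_i == 0.
    r_i = y_i * E_i
    return (r_i < -tol and alpha_i < C) or (r_i > tol and alpha_i > 0)
```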
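And on the $E_i$-update point, the two strategies side by side, as a sketch; refresh_error_cache is a hypothetical helper taking an instance of the SVM class above (the commented-out lines in smo_trunk are exactly strategy B):

```python
def refresh_error_cache(svm, i1, i2, update_all=True):
    if update_all:
        # Strategy A (what the code above does): recompute E_i for every
        # sample after b changes -- O(n^2) per SMO step, but always consistent.
        svm.Ei = [svm.calculate_Ei(i) for i in range(svm.numOfSamples)]
    else:
        # Strategy B (what many implementations do): refresh only the two
        # optimized entries; the other cached errors go stale even though
        # they also depend on the new b.
        svm.Ei[i1] = svm.calculate_Ei(i1)
        svm.Ei[i2] = svm.calculate_Ei(i2)
```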
The sklearn version of the code:
```python
# @Author: phd
# @Date: 2019-11-05
# @Site: github.com/phdsky
# @Description: NULL
import numpy as np
import pandas as pd
from sklearn import svm
from matplotlib import pyplot as plt
def load_dataset(filename):
data = []; label = []
with open(filename) as fr:
for line in fr.readlines():
line = line.strip().split(' ')
data.append([float(line[0]), float(line[1])])
label.append(float(line[2]))
return data, label
def plot_point(data, label, support_vector, weight, bias):
for i in range(np.shape(data)[0]):
if label[i] == 1:
plt.scatter(data[i][0], data[i][1], c='b', s=20)
else:
plt.scatter(data[i][0], data[i][1], c='y', s=20)
for j in support_vector:
        # facecolors='none' draws hollow markers; c='' breaks on newer matplotlib
        plt.scatter(data[j][0], data[j][1], s=100, facecolors='none', alpha=0.5, linewidth=1.5, edgecolor='red')
for i, text in enumerate(np.arange(len(data))):
plt.annotate(text, (data[i][0], data[i][1]))
x = np.arange(0, 10, 0.01)
y = (weight[0][0] * x + bias) / (-1 * weight[0][1])
plt.scatter(x, y, s=5, marker='h')
plt.show()
if __name__ == "__main__":
X_train, y_train = load_dataset('./dataset.txt')
clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
clf.fit(X_train, y_train)
support_vector_number = clf.n_support_
support_vector_index = clf.support_
w = clf.coef_
b = clf.intercept_
plot_point(X_train, y_train, support_vector_index, w, b)
    print(w, b)
```
Output:
```
[[ 0.81444269 -0.27274371]] [-3.83775658]
```
Judging from the figure, sklearn's result is spot-on. A quick skim of sklearn's source shows the svm module is still backed by libsvm underneath, so the libsvm source seems worth studying when time allows.
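As a quick sanity check of the linear decision function (a sketch assuming clf, X_train, w, and b are as in the script above, and that no sample sits exactly on the boundary): the sign of w·x + b should reproduce clf.predict:

```python
import numpy as np

# For a linear kernel, the learned hyperplane is w . x + b = 0 and the
# predicted label is just the sign of the decision value.
scores = np.dot(X_train, np.asarray(w).ravel()) + b  # shape (n_samples,)
manual_pred = np.where(scores >= 0, 1, -1)
assert (manual_pred == clf.predict(X_train)).all()
```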
Summary
Placeholder, to be filled in later.
References
- Li Hang, *Statistical Learning Methods* (《统计学习方法》)
- John C. Platt, "Sequential Minimal Optimization: A Fast Algorithm for Training Support Vector Machines"