支持向量机
SVM(Support Vector Machine),适合用于中小型复杂数据集的分类。
支持向量机有三宝
- 间隔
- 对偶
- 核技巧
大间隔分类(Large margin classification)
from sklearn.svm import SVC
from sklearn import datasets
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# Load the iris dataset and keep only the two petal features.
iris = datasets.load_iris()
petal_columns = (2, 3)  # petal length, petal width
X = iris["data"][:, petal_columns]
y = iris["target"]  # class labels

# Boolean mask: True where the label is 0 (setosa) or 1 (versicolor),
# False for every other sample.
setosa_or_versicolor = np.logical_or(y == 0, y == 1)
print(len(setosa_or_versicolor))

# Keep only the samples selected by the mask.
X, y = X[setosa_or_versicolor], y[setosa_or_versicolor]
print(len(X))

# SVM classifier: linear kernel with C set to infinity.
svm_clf = SVC(C=float("inf"), kernel="linear")
svm_clf.fit(X, y)
150 100 SVC(C=inf, kernel='linear')
模型使用
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Draw a fitted linear SVM's decision boundary, margins and support vectors.

    Everything is drawn with ``plt`` onto the current figure, and the support
    vectors are printed to stdout.

    Args:
        svm_clf: Fitted classifier exposing ``coef_``, ``intercept_`` and
            ``support_vectors_``. Only ``coef_[0]`` and ``intercept_[0]`` are
            read, and the first two components of each are used as a 2-D
            weight vector.
        xmin: Left end of the x-range over which the lines are sampled.
        xmax: Right end of that range.
    """
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]
    svs = svm_clf.support_vectors_

    # Highlight the support vectors with large pale markers.
    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')

    if w[1] == 0:
        # Vertical boundary: w0*x0 + b = level cannot be solved for x1, so
        # draw the boundary (level 0) and the two gutters (levels +1 and -1)
        # as vertical lines at x0 = (level - b) / w0.
        for level, line_style in ((0, "-"), (1, "--"), (-1, "--")):
            plt.axvline((level - b) / w[0], color="k",
                        linestyle=line_style, linewidth=2)
    else:
        # At the decision boundary, w0*x0 + w1*x1 + b = 0
        # => x1 = -w0/w1 * x0 - b/w1
        x0 = np.linspace(xmin, xmax, 200)
        decision_boundary = -w[0] / w[1] * x0 - b / w[1]
        # Vertical offset of the lines w.x + b = +1 and w.x + b = -1.
        margin = 1 / w[1]
        gutter_up = decision_boundary + margin
        gutter_down = decision_boundary - margin
        plt.plot(x0, decision_boundary, "k-", linewidth=2)
        plt.plot(x0, gutter_up, "k--", linewidth=2)
        plt.plot(x0, gutter_down, "k--", linewidth=2)

    print(svs)  # output the support vectors
# Plot each class with its own marker style and legend entry.
for class_label, marker_style, legend_name in (
    (1, "bs", "Iris versicolor"),
    (0, "yo", "Iris setosa"),
):
    class_mask = y == class_label
    plt.plot(X[class_mask, 0], X[class_mask, 1], marker_style, label=legend_name)

plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=14)

# Overlay the decision boundary, margins and support vectors, then fix the
# visible window before showing the figure.
plot_svc_decision_boundary(svm_clf, 0, 5.5)
plt.axis([0, 5.5, 0, 2])
plt.show()
[[1.9 0.4] [3. 1.1]]
特征尺度敏感性(Sensitivity to feature scales)
# Without feature scaling: fit directly on the raw toy data.
Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]], dtype=np.float64)
ys = np.array([0, 0, 1, 1])

svm_clf = SVC(C=100, kernel="linear")
svm_clf.fit(Xs, ys)
plot_svc_decision_boundary(svm_clf, 0, 6)

# Class 1 as blue circles, class 0 as magenta squares.
plt.plot(Xs[ys == 1, 0], Xs[ys == 1, 1], "bo")
plt.plot(Xs[ys == 0, 0], Xs[ys == 0, 1], "ms")
plt.show()
# With feature scaling: standardize the features, then refit the same model.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xs)

svm_clf.fit(X_scaled, ys)
plot_svc_decision_boundary(svm_clf, -2, 2)

# Class 1 as blue circles, class 0 as magenta squares.
plt.plot(X_scaled[ys == 1, 0], X_scaled[ys == 1, 1], "bo")
plt.plot(X_scaled[ys == 0, 0], X_scaled[ys == 0, 1], "ms")
plt.show()
[[ 1. 50.] [ 5. 60.]]
[[-1.50755672 -0.11547005] [ 0.90453403 -1.5011107 ] [ 0.90453403 0.34641016]]
软间隔分类
如果你的SVM模型过拟合,可以尝试通过降低超参数C来对其进行正则化(C越小,间隔越宽,但允许的间隔违例也越多)。
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
# Reload iris and build a binary target: 1.0 where the label is 2, else 0.0.
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = (iris["target"] == 2).astype(np.float64)

# Standardize the features, then fit a linear SVM with hinge loss.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=100, loss="hinge")),
]
svm_clf = Pipeline(pipeline_steps)
svm_clf.fit(X, y)

# Predict the class of one sample given as [petal length, petal width].
svm_clf.predict([[5.5, 1.7]])
array([1.])