Classification with an SVM (Support Vector Machine) is one of the more mature algorithms in machine learning. I have already written a post that covers the theory in detail:
Link: A Brief Overview of SVM Theory
Today we will use the ML module of OpenCV in Python to walk through SVM classification.
First, binary classification.
__author__ = "Luke Liu"
#encoding="utf-8"
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn import datasets

def plot_decision_boundary(model, X_train, y_train):
    h = 0.02  # grid step size
    # range of feature 1
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    # range of feature 2
    x2_min, x2_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    # position matrices covering the feature-1/feature-2 plane
    xx, xx2 = np.meshgrid(np.arange(x_min, x_max, h),
                          np.arange(x2_min, x2_max, h))
    # merge the two grids into an (F1, F2) sample matrix
    X_hypo = np.c_[xx.ravel().astype(np.float32),
                   xx2.ravel().astype(np.float32)]
    # predict() returns a pair: a status value and the predicted labels
    ret, zz = model.predict(X_hypo)
    zz = zz.reshape(xx.shape)  # contourf needs xx, xx2 and zz to share a shape
    plt.contourf(xx, xx2, zz, cmap=plt.cm.coolwarm, alpha=0.8)  # cool-warm colormap
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=150)
    plt.show()

if __name__ == "__main__":
    X, y = datasets.make_classification(n_samples=600, n_features=2,
                                        n_informative=2,
                                        n_classes=2, n_redundant=0,
                                        random_state=100)
    print("The shape of X:\n{}".format(X.shape))
    print("The shape of y:\n{}".format(y.shape))
    plt.figure(figsize=(12, 6))
    plt.scatter(X[:, 0], X[:, 1], s=100, c=y)
    plt.xlabel("feature 1 (of 2)")
    plt.ylabel("feature 2 (of 2)")
    X = X.astype(np.float32)
    # OpenCV's SVM wants binary labels in {-1, 1} as 32-bit integers
    y_new = []
    for i in y:
        if i == 0:
            y_new.append(-1)
        else:
            y_new.append(1)
    y = np.array(y_new, dtype=np.int32)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.1, random_state=42)
    svm1 = cv2.ml.SVM_create()
    svm1.setKernel(cv2.ml.SVM_LINEAR)
    svm1.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
    ret, y_predict = svm1.predict(X_test)
    scores = metrics.accuracy_score(y_test, y_predict)
    print(scores)
    plot_decision_boundary(svm1, X, y)
A few points are worth noting. The first concerns the parameters of datasets.make_classification (the scikit-learn documentation lists them in full).
In particular, n_classes is tightly coupled to n_clusters_per_class and n_informative; choosing them carelessly raises the following error:
ValueError: n_classes * n_clusters_per_class must be smaller or equal 2 ** n_informative
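A quick sanity check of that constraint (note that n_clusters_per_class defaults to 2):

from sklearn import datasets
# Fails: n_classes * n_clusters_per_class = 3 * 2 = 6 > 2**2 = 4
try:
    datasets.make_classification(n_samples=100, n_features=4,
                                 n_informative=2, n_redundant=0,
                                 n_classes=3)
except ValueError as e:
    print(e)
# Works: more informative features give 2**3 = 8 >= 6 ...
X, y = datasets.make_classification(n_samples=100, n_features=4,
                                    n_informative=3, n_redundant=0,
                                    n_classes=3)
# ... or fewer clusters per class: 3 * 1 = 3 <= 2**2 = 4
X, y = datasets.make_classification(n_samples=100, n_features=4,
                                    n_informative=2, n_redundant=0,
                                    n_classes=3, n_clusters_per_class=1)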
Note that when using an SVM for binary classification, the target values need some preprocessing: y must take the values -1 and 1. For why, see the blog post linked above.
The corresponding code fragment:
y_new = []
for i in y:
    if i == 0:
        y_new.append(-1)
    else:
        y_new.append(1)
y = np.array(y_new, dtype=np.int32)
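The same mapping can be written in one vectorized line (the int32 cast is what OpenCV's trainer expects for class labels):

import numpy as np
y = np.where(y == 0, -1, 1).astype(np.int32)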
Next comes the choice of kernel, set via the classifier method setKernel(<kernel>).
The best-known kernel family is the radial basis function (RBF), which depends only on the distance to a reference point; the classic example is the Gaussian kernel K(x, x') = exp(-γ‖x − x'‖²).
The available kernels include:
cv2.ml.SVM_LINEAR: a linear split in the original feature space.
cv2.ml.SVM_POLY: a polynomial split in the original space; requires a coefficient (svm.setCoef0) and a degree (svm.setDegree).
cv2.ml.SVM_RBF: a Gaussian kernel mapping into a higher-dimensional space.
cv2.ml.SVM_SIGMOID: a sigmoid kernel, similar to logistic regression.
cv2.ml.SVM_INTER: a histogram intersection kernel.
In addition, the classifier can be configured with, for example:
svm.setType(cv2.ml.SVM_C_SVC)  # SVM type; the default is C_SVC
svm.setGamma(0.5)              # kernel parameter
svm.setNu(0.5)                 # parameter of the SVM optimization problem
svm.setC(1)                    # penalty parameter C of the optimization problem
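For reference, here is a minimal sketch of how the RBF run behind result 2 below might be set up; the gamma and C values are illustrative assumptions, not tuned:

svm_rbf = cv2.ml.SVM_create()
svm_rbf.setType(cv2.ml.SVM_C_SVC)
svm_rbf.setKernel(cv2.ml.SVM_RBF)
svm_rbf.setGamma(0.5)  # assumed kernel width
svm_rbf.setC(1.0)      # assumed soft-margin penalty
svm_rbf.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
_, y_pred_rbf = svm_rbf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_rbf))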
OUTPUT:
1. Linear kernel (original feature space): accuracy 86.3%.
2. RBF kernel (high-dimensional space): accuracy 95%.
A multi-class attempt with SVM (cascading several binary classifiers): a first SVM separates class 0 from classes 1 and 2, then a second SVM separates class 1 from class 2 among the samples the first stage did not claim.
__author__ = "Luke Liu"
#encoding="utf-8"
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn import datasets
# plot_decision_boundary: same helper as in the binary example above, omitted here.
if __name__ == "__main__":
    X, y = datasets.make_classification(n_samples=600, n_features=8,
                                        n_informative=8,
                                        n_classes=3, n_redundant=0,
                                        random_state=100)
    temp = y.copy()  # keep the original 3-class labels for stage 2
    print("The shape of X:\n{}".format(X.shape))
    print("The shape of y:\n{}".format(y.shape))
    plt.figure(figsize=(12, 6))
    plt.scatter(X[:, 0], X[:, 1], s=100, c=y)
    plt.xlabel("feature 1 (of 8)")
    plt.ylabel("feature 2 (of 8)")
    plt.show()
    # preprocessing
    X = X.astype(np.float32)
    # stage 1 labels: class 0 -> -1, classes 1 and 2 -> 1
    y_new = []
    pick_up_points = []  # indices of samples whose true class is 1 or 2
    for cnt, label in enumerate(temp):
        if label == 0:
            y_new.append(-1)
        else:
            y_new.append(1)
            pick_up_points.append(cnt)
    y = np.array(y_new, dtype=np.int32)
    # stage 2 data: only classes 1 and 2, remapped to -1 and 1
    X2 = X[pick_up_points]
    y2 = (2 * temp[pick_up_points] - 3).astype(np.int32)  # 1 -> -1, 2 -> 1
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=42)
    X2_train, X2_test, y2_train, y2_test = model_selection.train_test_split(
        X2, y2, test_size=0.2, random_state=42)
    svm1 = cv2.ml.SVM_create()
    svm2 = cv2.ml.SVM_create()
    svm1.setKernel(cv2.ml.SVM_LINEAR)
    svm2.setKernel(cv2.ml.SVM_LINEAR)
    svm1.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
    svm2.train(X2_train, cv2.ml.ROW_SAMPLE, y2_train)
    # stage 1 prediction: split the test set by the predicted sign
    pick_one_mark = []  # indices predicted as class 0
    pick_two_mark = []  # indices left for stage 2 (class 1 or 2)
    ret, y_predict = svm1.predict(X_test)
    for cnt2, p in enumerate(y_predict):
        if p[0] == -1:
            pick_one_mark.append(cnt2)
        else:
            pick_two_mark.append(cnt2)
    # stage 2 prediction on the still-undecided samples
    X2_test_undefined = np.array([X_test[i] for i in pick_two_mark])
    ret, y_predict_2 = svm2.predict(X2_test_undefined)
    # plot_decision_boundary(svm1, X, y)
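To score the two-stage cascade, the per-stage predictions can be stitched back into three-class labels; a minimal sketch, assuming the variables from the script above (reusing the same random_state recovers the matching three-class ground truth for the test split):

# ground-truth 3-class labels for the same test split
_, _, _, y_true3 = model_selection.train_test_split(
    X, temp, test_size=0.2, random_state=42)
y_final = np.empty(len(y_predict), dtype=np.int32)
for idx in pick_one_mark:                # stage 1 said class 0
    y_final[idx] = 0
for k, idx in enumerate(pick_two_mark):  # stage 2 decides 1 vs 2
    y_final[idx] = int((y_predict_2[k][0] + 3) // 2)  # -1 -> 1, +1 -> 2
print(metrics.accuracy_score(y_true3, y_final))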