文章目录

一、数据集准备

MNIST数据集:点击下载。data2image.py——将MNIST数据集转成图片保存。训练集6万张,测试集1万张。

import numpy as np
import struct
import cv2
import uuid


train_images_idx3_ubyte_file = 'E:/data/raw/train-images.idx3-ubyte' # 训练集文件
train_labels_idx1_ubyte_file = 'E:/data/raw/train-labels.idx1-ubyte' # 训练集标签文件

test_images_idx3_ubyte_file = 'E:/data/raw/t10k-images.idx3-ubyte' # 测试集文件
test_labels_idx1_ubyte_file = 'E:/data/raw/t10k-labels.idx1-ubyte' # 测试集标签文件


def decode_idx3_ubyte(idx3_ubyte_file):
    """Decode an IDX3 (MNIST image) file into a NumPy array.

    The file begins with four big-endian 32-bit integers (magic number,
    image count, rows per image, columns per image) followed by one
    unsigned byte per pixel.

    :param idx3_ubyte_file: path of the idx3-ubyte file
    :return: float64 array of shape (num_images, num_rows, num_cols)
    :raises struct.error: if the header is incomplete or the file holds
        fewer pixel bytes than the header announces
    """
    # Read inside a context manager so the file handle is always closed.
    with open(idx3_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Four 32-bit integers, hence four 'i' codes; '>' selects big-endian.
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))

    # Pixel data starts right after the header (byte 16).
    offset = struct.calcsize(fmt_header)
    num_pixels = num_images * num_rows * num_cols
    if len(bin_data) - offset < num_pixels:
        raise struct.error('idx3 file is truncated: expected %d pixel bytes, found %d'
                           % (num_pixels, len(bin_data) - offset))

    # Decode every pixel in one call instead of unpacking image by image.
    pixels = np.frombuffer(bin_data, dtype=np.uint8, count=num_pixels, offset=offset)
    # Callers previously received a float64 array (np.empty default); keep that dtype.
    return pixels.reshape((num_images, num_rows, num_cols)).astype(np.float64)


def decode_idx1_ubyte(idx1_ubyte_file):
    """Decode an IDX1 (MNIST label) file into a NumPy array.

    The file begins with two big-endian 32-bit integers (magic number and
    label count) followed by one unsigned byte per label.

    :param idx1_ubyte_file: path of the idx1-ubyte file
    :return: float64 array of shape (num_labels,)
    :raises struct.error: if the header is incomplete or the file holds
        fewer label bytes than the header announces
    """
    # Read inside a context manager so the file handle is always closed.
    with open(idx1_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and number of labels.
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))

    offset = struct.calcsize(fmt_header)
    if len(bin_data) - offset < num_images:
        raise struct.error('idx1 file is truncated: expected %d label bytes, found %d'
                           % (num_images, len(bin_data) - offset))

    # Decode all labels in one call instead of unpacking byte by byte.
    labels = np.frombuffer(bin_data, dtype=np.uint8, count=num_images, offset=offset)
    # Callers previously received a float64 array (np.empty default); keep that dtype.
    return labels.astype(np.float64)


def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """Decode the training-set image file.

    :param idx_ubyte_file: path of the idx3 file; defaults to the
        module-level training image path
    :return: whatever decode_idx3_ubyte returns for that file
    """
    train_images = decode_idx3_ubyte(idx_ubyte_file)
    return train_images


def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """Decode the training-set label file.

    :param idx_ubyte_file: path of the idx1 file; defaults to the
        module-level training label path
    :return: whatever decode_idx1_ubyte returns for that file
    """
    train_labels = decode_idx1_ubyte(idx_ubyte_file)
    return train_labels


def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """Decode the test-set image file.

    :param idx_ubyte_file: path of the idx3 file; defaults to the
        module-level test image path
    :return: whatever decode_idx3_ubyte returns for that file
    """
    test_images = decode_idx3_ubyte(idx_ubyte_file)
    return test_images


def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """Decode the test-set label file.

    :param idx_ubyte_file: path of the idx1 file; defaults to the
        module-level test label path
    :return: whatever decode_idx1_ubyte returns for that file
    """
    test_labels = decode_idx1_ubyte(idx_ubyte_file)
    return test_labels



if __name__ == '__main__':
    import os  # local import: this script's import block does not provide os

    # To export the training set instead, load it with load_train_images() /
    # load_train_labels() and write to 'E:/data/raw/train_image/'.
    test_images = load_test_images()
    test_labels = load_test_labels()
    save_path = 'E:/data/raw/test_image/'

    # cv2.imwrite reports failure by returning False instead of raising, so a
    # missing output folder would otherwise go unnoticed.
    os.makedirs(save_path, exist_ok=True)

    # File name pattern: <label>_<index>.png
    for i, (image, label) in enumerate(zip(test_images, test_labels)):
        file_name = save_path + str(int(label)) + '_' + str(i) + '.png'
        if not cv2.imwrite(file_name, image.astype(np.uint8)):
            raise IOError('failed to write image: ' + file_name)
    print('done')

二、训练

使用svm进行多分类训练。【关于SVM理论】,【关于SVM超平面推导】。

在机器学习实战书中提到 将MNIST-image数据集运用SVM进行识别,以线性核函数、RBF核函数、Sigmoid核函数为核函数,以OVO和OVR策略,两两搭配,共有六种组合,以这六种组合分别训练和测试数据集,并记录其训练时间、测试时间、准确率和平均准确率,得到如下两表:

SVM 实现mnist 手写数字图像识别_核函数


造成核函数和策略组合性能不同的原因如下:

(1)线性核函数、RBF核函数和Sigmoid核函数公式的复杂度不同,导致训练时间和测试时间出现差异。

(2)理论上,OVR只需要训练N个分类器,而OVO需要训练N(N-1)/2个分类器,因此OVO的存储开销和测试时间开销通常比OVR更大。而在训练时,OVR的每个分类器均使用全部训练样例,而OVO的每个分类器仅用到两个类的样例。因此,在类别很多的时候,OVO的训练时间开销通常比OVR更小。

(3)手写数字识别中,各种数字写法复杂,这明显是线性不可分的情景,所以线性核函数的准确率较低。

2.1 模型训练

所有训练集图片放在同一个文件夹内

from PIL import Image
import os
import numpy as np
import time
from sklearn import svm
import joblib



def get_img(path):
    """Return the joined paths of every ``.png`` entry listed in *path*."""
    png_paths = []
    for entry in os.listdir(path):
        if entry.endswith(".png"):
            png_paths.append(os.path.join(path, entry))
    return png_paths


def img2vector(imgFile):
    """Load an image as grayscale and flatten it into a binarised row vector.

    :param imgFile: path of the image file
    :return: array of shape (1, height * width) with each pixel rounded to 0 or 1
    """
    gray = Image.open(imgFile).convert('L')
    pixels = np.array(gray, 'i')  # integer grayscale matrix (28 x 28 for the MNIST images)
    # Scale by 255 and round, so every pixel collapses to 0 or 1.
    binary = np.round(pixels / 255)
    return binary.reshape((1, -1))  # a single row (784 columns for a 28 x 28 image)


def read_and_convert(imgFileList):
    """Turn a list of image paths into a feature matrix and a label vector.

    The class label is the first character of each file name, so files are
    expected to be named ``<digit>_<index>.png``.

    :param imgFileList: list of image file paths
    :return: (dataMat, dataLabel) where dataMat has shape (n, 784) with one
        row per image and dataLabel is a uint8 vector of length n
    """
    dataNum = len(imgFileList)  # number of images
    dataLabel = np.zeros(dataNum, dtype=np.uint8)  # class labels
    dataMat = np.zeros((dataNum, 784))  # one 28 * 28 image per row
    for i, img_path in enumerate(imgFileList):
        # basename copes with both '/' and the platform separator, unlike
        # split("/"); int() makes the char-to-digit conversion explicit.
        dataLabel[i] = int(os.path.basename(img_path)[0])
        dataMat[i, :] = img2vector(img_path)
    return dataMat, dataLabel


# Load the training data
def read_all_data(train_data_path):
    """List the PNG files in *train_data_path* and convert them.

    :param train_data_path: folder that holds the training images
    :return: the (dataMat, dataLabel) pair produced by read_and_convert
    """
    return read_and_convert(get_img(train_data_path))

# Build, fit and persist the model
def create_svm(dataMat, dataLabel, path, decision='ovr'):
    """Fit an RBF-kernel SVC and dump the fitted model to *path*.

    :param dataMat: feature matrix, one sample per row
    :param dataLabel: class label of every sample
    :param path: file the fitted model is written to with joblib
    :param decision: value passed as ``decision_function_shape``
    :return: the classifier object that was fitted
    """
    classifier = svm.SVC(C=1.0, kernel='rbf', decision_function_shape=decision)
    joblib.dump(classifier.fit(dataMat, dataLabel), path)
    return classifier


'''
SVC参数
svm.SVC(C=1.0,kernel='rbf',degree=3,gamma='auto',coef0=0.0,shrinking=True,probability=False,
tol=0.001,cache_size=200,class_weight=None,verbose=False,max_iter=-1,decision_function_shape='ovr',random_state=None)

C:C-SVC的惩罚参数C?默认值是1.0
C越大,相当于惩罚松弛变量,希望松弛变量接近0,即对误分类的惩罚增大,这样对训练集测试时准确率很高,但泛化能力弱。
C值小,对误分类的惩罚减小,允许容错,将他们当成噪声点,泛化能力较强。

kernel :核函数,默认是rbf,可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
0 – 线性: u'v
1 – 多项式:(gamma*u'*v + coef0)^degree
2 – RBF函数:exp(-gamma|u-v|^2)
3 –sigmoid: tanh(gamma*u'*v + coef0)
degree :多项式poly函数的维度,默认是3,选择其他核函数时会被忽略。(没用)
gamma : ‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。取’auto’时为1/n_features;注意 scikit-learn 0.22 起默认值为’scale’,即 1/(n_features * X.var())
coef0 : 核函数的常数项。对于‘poly’和 ‘sigmoid’有用。(没用)
probability :是否采用概率估计?.默认为False
shrinking : 是否采用shrinking heuristic方法,默认为true
tol : 停止训练的误差值大小,默认为1e-3
cache_size : 核函数cache缓存大小,默认为200
class_weight :类别的权重,字典形式传递。设置第几类的参数C为weight*C(C-SVC中的C)
verbose : 允许冗余输出?
max_iter : 最大迭代次数。-1为无限制。
decision_function_shape :‘ovo’ 或 ‘ovr’,默认为‘ovr’(本文选用ovr,一对多)
random_state : 数据洗牌时的种子值,int值

主要调节的参数有:C、kernel、degree、gamma、coef0
'''

if __name__ == '__main__':
    train_data_path = "E:/data/raw/train_image/"
    model_path = 'E:/data/raw/model/svm.model'

    dataMat, dataLabel = read_all_data(train_data_path)
    print("read data done!")

    # Time only the fit-and-save step, not the image loading above.
    started = time.time()
    create_svm(dataMat, dataLabel, model_path, decision='ovr')
    elapsed = time.time() - started
    print("Training spent {:.4f}s.".format(elapsed))
2.2 模型预测

所有测试集图片放在同一个文件夹内

import time
import os
import joblib
from PIL import Image
import numpy as np


def get_img(path):
    """Collect the paths of the ``.png`` entries found in folder *path*."""
    png_names = filter(lambda entry: entry.endswith(".png"), os.listdir(path))
    return [os.path.join(path, entry) for entry in png_names]

def img2vector(imgFile):
    """Read an image in grayscale and return it as a binarised row vector.

    :param imgFile: path of the image file
    :return: array of shape (1, height * width) with each pixel rounded to 0 or 1
    """
    gray_pixels = np.array(Image.open(imgFile).convert('L'), 'i')  # integer grayscale matrix
    # Dividing by 255 and rounding leaves only the values 0 and 1.
    rounded = np.round(gray_pixels / 255)
    row_vector = rounded.reshape((1, -1))  # 784 columns for a 28 x 28 image
    return row_vector

def svm_test(test_data_path, model_path, result_path="E:/data/raw/pre_result.txt"):
    """Predict every PNG in a folder and log running accuracy and throughput.

    The true label is the first character of each file name. One CSV row
    per image is written to *result_path*.

    :param test_data_path: folder that holds the test images
    :param model_path: joblib file of the fitted classifier
    :param result_path: CSV file the per-image results are written to
    """
    clf = joblib.load(model_path)  # load the fitted model
    img_list = get_img(test_data_path)

    t0 = time.time()
    error_count = 0
    # The context manager closes the result file even if a prediction fails.
    with open(result_path, 'w') as f:
        f.write('imgName,actual,pre_result,accuracy_rate,FPS')
        f.write("\n")

        for i, img_path in enumerate(img_list):
            # basename copes with both '/' and the platform separator.
            dataLabel = os.path.basename(img_path)[0]
            dataMat = img2vector(img_path)
            preResult = clf.predict(dataMat)[0]
            print("num :" + str(i + 1), "dataLabel :", dataLabel, " preResult:", preResult)
            if str(preResult) != dataLabel:
                error_count += 1

            accuracy_rate = (i + 1 - error_count) / (i + 1) * 100
            print("The accuracy rate is: ", accuracy_rate, "%")

            # A coarse clock can report zero elapsed time on the first images.
            elapsed = time.time() - t0
            FPS = (i + 1) / elapsed if elapsed > 0 else float('inf')
            print("FPS is:", FPS)
            f.write(",".join([img_path, dataLabel, str(preResult), str(accuracy_rate), str(FPS)]))
            f.write("\n")

if __name__ == '__main__':
    # Evaluate the saved model on the exported test images.
    svm_test(
        test_data_path="E:/data/raw/test_image/",
        model_path='E:/data/raw/model/svm.model',
    )

在上述1W张测试集中,预测准确率为97.45%。

三、模型优化

【关于网格搜索】​ ​​【多特征融合】​

3.1 多特征拼接

使用了投影特征:水平投影与垂直投影

# Projection features added on top of the raw pixels
def project(img_arr):
    """Compute normalised vertical and horizontal projections of an image.

    :param img_arr: 2-D image array; every non-zero pixel counts as foreground
    :return: (vertical_norm, horizontal_norm) where vertical_norm has shape
        (1, width) and holds each column's foreground count divided by the
        height, and horizontal_norm has shape (1, height) and holds each
        row's foreground count divided by the width
    """
    height, width = img_arr.shape[:2]
    foreground = img_arr != 0

    # Vertical projection: foreground pixels per column.
    vertical = np.count_nonzero(foreground, axis=0).astype(np.int32).reshape((1, width))
    # Horizontal projection: foreground pixels per row.
    horizontal = np.count_nonzero(foreground, axis=1).astype(np.int32).reshape((1, height))

    vertical_norm = np.divide(vertical, height)
    horizontal_norm = np.divide(horizontal, width)
    return vertical_norm, horizontal_norm


# Reworked to concatenate the pixel and projection features
def img2vector(imgFile):
    """Build one feature row: binarised pixels plus both projections.

    :param imgFile: path of the image file
    :return: array of shape (1, h * w + w + h); 784 + 28 + 28 = 840 columns
        for a 28 x 28 image, so any buffer receiving it must be that wide
    :raises FileNotFoundError: if the image cannot be read
    """
    gray = cv2.imread(imgFile, 0)
    # cv2.imread signals failure by returning None instead of raising.
    if gray is None:
        raise FileNotFoundError('cannot read image: %s' % imgFile)
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

    img_normalization = np.round(thresh / 255)  # 255 -> 1, 0 -> 0
    img_arr2 = np.reshape(img_normalization, (1, -1))  # flattened pixels as one row
    vertical_norm, horizontal_norm = project(thresh)
    # The original ended with a bare `return`, handing None to the caller.
    return np.concatenate((img_arr2, vertical_norm, horizontal_norm), axis=1)

在上述1W张测试集中预测准确率为97.67%左右(未加投影特征时为97.45%),并没有明显提升。该问题是由该特征与原特征的差异性导致的。

3.2 使用多模型融合

采取多模型的权重分配决定最终预测结果。

train.py

from PIL import Image
import os
import numpy as np
import time
from sklearn import svm
import joblib
import cv2


def get_img(path):
    """List folder *path* and return the joined path of each ``.png`` entry."""
    selected = []
    for entry in os.listdir(path):
        if not entry.endswith(".png"):
            continue
        selected.append(os.path.join(path, entry))
    return selected


class Make_features():
    """Binarise one image and expose three feature vectors derived from it."""

    def __init__(self, imgFile):
        """Read *imgFile* in grayscale and threshold it at 127.

        :param imgFile: path of the image file
        :raises FileNotFoundError: if the image cannot be read
        """
        self.imgFile = imgFile
        gray = cv2.imread(self.imgFile, 0)
        # cv2.imread signals failure by returning None instead of raising.
        if gray is None:
            raise FileNotFoundError('cannot read image: %s' % imgFile)
        _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        self.thresh = thresh

    def features_vertical(self):
        """Return the per-column foreground ratio, shape (1, width)."""
        height, width = self.thresh.shape[:2]
        # Vertical projection: count the non-zero pixels of every column.
        vertical = np.count_nonzero(self.thresh != 0, axis=0).astype(np.int32).reshape((1, width))
        vertical_norm = np.divide(vertical, height)
        return vertical_norm

    def features_horizont(self):
        """Return the per-row foreground ratio, shape (1, height)."""
        height, width = self.thresh.shape[:2]
        # Horizontal projection: count the non-zero pixels of every row.
        horizont = np.count_nonzero(self.thresh != 0, axis=1).astype(np.int32).reshape((1, height))
        horizont_norm = np.divide(horizont, width)
        return horizont_norm

    def features_img2vector(self):
        """Return the binarised pixels as one row, shape (1, height * width)."""
        img_normalization = np.round(self.thresh / 255)  # 255 -> 1, 0 -> 0
        img_arr4 = np.reshape(img_normalization, (1, -1))  # 784 columns for 28 x 28
        return img_arr4



# Load the training data
def Read_All_Img_convert(train_data_path):
    """Extract the three feature matrices and the labels for a folder of PNGs.

    The class label is the first character of each file name.

    :param train_data_path: folder that holds the training images
    :return: (dataMat_imgarray, dataMat_vertical, dataMat_horizont, dataLabel)
        with shapes (n, 784), (n, 28), (n, 28) and (n,)
    """
    imgFileList = get_img(train_data_path)
    dataNum = len(imgFileList)  # number of images
    dataLabel = np.zeros(dataNum, dtype=np.uint8)  # class labels
    dataMat_imgarray = np.zeros((dataNum, 784))  # one 28 * 28 image per row
    dataMat_vertical = np.zeros((dataNum, 28))
    dataMat_horizont = np.zeros((dataNum, 28))
    for i, img_path in enumerate(imgFileList):
        # basename copes with both '/' and the platform separator.
        dataLabel[i] = int(os.path.basename(img_path)[0])
        # Read and threshold the image once, then derive all three features.
        features = Make_features(img_path)
        dataMat_imgarray[i, :] = features.features_img2vector()
        dataMat_vertical[i, :] = features.features_vertical()
        dataMat_horizont[i, :] = features.features_horizont()
    return dataMat_imgarray, dataMat_vertical, dataMat_horizont, dataLabel


class SVMs():
    """Bundle a feature matrix, its labels and the path the model is saved to."""

    def __init__(self, model_path, dataMat, dataLabel):
        """Store the output path and the training data unchanged."""
        self.model_path, self.dataMat, self.dataLabel = model_path, dataMat, dataLabel

    # Build, fit and persist the model
    def create_svm(self, decision='ovr'):
        """Fit an RBF-kernel SVC with probability estimates and dump it.

        :param decision: value passed as ``decision_function_shape``
        """
        classifier = svm.SVC(C=1.0, kernel='rbf', decision_function_shape=decision, probability=True)
        fitted = classifier.fit(self.dataMat, self.dataLabel)
        joblib.dump(fitted, self.model_path)


if __name__ == '__main__':
    train_data_path = "E:/data/raw/train_image/"
    dataMat_imgarray, dataMat_vertical, dataMat_horizont, dataLabel = Read_All_Img_convert(train_data_path)
    print("read data done!")

    # One model per feature family: raw pixels, vertical and horizontal projection.
    training_jobs = (
        ('E:/data/raw/model/svm4_0.model', dataMat_imgarray),
        ('E:/data/raw/model/svm4_1.model', dataMat_vertical),
        ('E:/data/raw/model/svm4_2.model', dataMat_horizont),
    )

    lap_start = time.time()
    for model_path, features in training_jobs:
        SVMs(model_path=model_path, dataMat=features, dataLabel=dataLabel).create_svm()
        lap_end = time.time()
        print("Training spent {:.4f}s.".format(lap_end - lap_start))
        # The next lap is measured from the end of this one.
        lap_start = lap_end

test.py

import time
import os
import joblib
import numpy as np
import cv2


def get_img(path):
    """Return ``os.path.join(path, name)`` for every ``.png`` name in *path*."""
    entries = os.listdir(path)
    png_entries = [entry for entry in entries if entry.endswith(".png")]
    return [os.path.join(path, entry) for entry in png_entries]


class Make_features():
    """Binarise one image and expose three feature vectors derived from it."""

    def __init__(self, imgFile):
        """Read *imgFile* in grayscale and threshold it at 127.

        :param imgFile: path of the image file
        :raises FileNotFoundError: if the image cannot be read
        """
        self.imgFile = imgFile
        gray = cv2.imread(self.imgFile, 0)
        # cv2.imread signals failure by returning None instead of raising.
        if gray is None:
            raise FileNotFoundError('cannot read image: %s' % imgFile)
        _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        self.thresh = thresh

    def features_vertical(self):
        """Return the per-column foreground ratio, shape (1, width)."""
        height, width = self.thresh.shape[:2]
        # Vertical projection: count the non-zero pixels of every column.
        vertical = np.count_nonzero(self.thresh != 0, axis=0).astype(np.int32).reshape((1, width))
        vertical_norm = np.divide(vertical, height)
        return vertical_norm

    def features_horizont(self):
        """Return the per-row foreground ratio, shape (1, height)."""
        height, width = self.thresh.shape[:2]
        # Horizontal projection: count the non-zero pixels of every row.
        horizont = np.count_nonzero(self.thresh != 0, axis=1).astype(np.int32).reshape((1, height))
        horizont_norm = np.divide(horizont, width)
        return horizont_norm

    def features_img2vector(self):
        """Return the binarised pixels as one row, shape (1, height * width)."""
        img_normalization = np.round(self.thresh / 255)  # 255 -> 1, 0 -> 0
        img_arr4 = np.reshape(img_normalization, (1, -1))  # 784 columns for 28 x 28
        return img_arr4


class SVM_TEST():
    """Evaluate a weighted fusion of three SVMs on a folder of test images."""

    def __init__(self, test_data_path, model_path0, model_path1, model_path2):
        """Store the test folder and the three model file paths.

        :param test_data_path: folder that holds the test images
        :param model_path0: model fed with the flattened binarised pixels
        :param model_path1: model fed with the vertical projection
        :param model_path2: model fed with the horizontal projection
        """
        self.model_path0 = model_path0
        self.model_path1 = model_path1
        self.model_path2 = model_path2
        self.test_data_path = test_data_path

    def svm_test(self, weights=(0.8, 0.1, 0.1), result_path="E:/data/raw/pre_result4_2.txt"):
        """Predict every test image and log running accuracy and throughput.

        The true label is the first character of each file name. One CSV
        row per image is written to *result_path*.

        :param weights: fusion weights applied to the probabilities of
            model 0, model 1 and model 2, in that order
        :param result_path: CSV file the per-image results are written to
        """
        clf0 = joblib.load(self.model_path0)
        clf1 = joblib.load(self.model_path1)
        clf2 = joblib.load(self.model_path2)
        img_list = get_img(self.test_data_path)
        w0, w1, w2 = weights

        t0 = time.time()
        error_count = 0
        # The context manager closes the result file even if a prediction fails.
        with open(result_path, 'w') as f:
            f.write('imgName,actual,pre_result,accuracy_rate,FPS')
            f.write("\n")

            for i, img_path in enumerate(img_list):
                # basename copes with both '/' and the platform separator.
                dataLabel = os.path.basename(img_path)[0]
                # Read and threshold the image once, then derive all features.
                features = Make_features(img_path)
                dataMat_imgarray = features.features_img2vector()
                dataMat_vertical = features.features_vertical()
                dataMat_horizont = features.features_horizont()

                preResult0 = clf0.predict_proba(dataMat_imgarray)
                preResult1 = clf1.predict_proba(dataMat_vertical)
                preResult2 = clf2.predict_proba(dataMat_horizont)
                fused = w0 * preResult0 + w1 * preResult1 + w2 * preResult2
                # predict_proba columns follow classes_, so map the winning
                # column back to its label rather than using the raw index.
                # NOTE(review): assumes all three models were fitted on the
                # same label set, so their classes_ line up.
                pre_labe = clf0.classes_[np.argmax(fused)]

                if str(pre_labe) != dataLabel:
                    error_count += 1

                accuracy_rate = (i + 1 - error_count) / (i + 1) * 100
                print("The accuracy rate is: ", accuracy_rate, "%")

                # A coarse clock can report zero elapsed time on the first images.
                elapsed = time.time() - t0
                FPS = (i + 1) / elapsed if elapsed > 0 else float('inf')
                print("FPS is:", FPS)
                f.write(",".join([img_path, dataLabel, str(pre_labe), str(accuracy_rate), str(FPS)]))
                f.write("\n")


if __name__ == '__main__':
    # Evaluate the three saved models together on the exported test images.
    tester = SVM_TEST(
        test_data_path="E:/data/raw/test_image/",
        model_path0='E:/data/raw/model/svm4_0.model',
        model_path1='E:/data/raw/model/svm4_1.model',
        model_path2='E:/data/raw/model/svm4_2.model',
    )
    tester.svm_test()

在上述1W张测试集中预测准确率97.80%左右,并没有明显提升。其实我们可以提取更多的有效特征

其实投影的特征真心不行。

还有就是要考虑图像预处理速度和预测速度,特征也不宜过多。

我查看了原始数据,发现外国人的手写数据中有一些奇怪的写法:其实我们可以删除模棱两可以及奇怪的字符,来提高准确率。例如:(数据集中的:0,1,2)

SVM 实现mnist 手写数字图像识别_核函数_02


附:skl2onnx

# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple skl2onnx
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple scikit-learn
import joblib
import onnxmltools
from skl2onnx.common.data_types import FloatTensorType
import numpy as np
import onnxruntime


def skl_to_onnx(input_skl_model="../weight/svm_xxx.model",
                output_onnx_model="../weight/svm_xxx.onnx",
                input_shape=(4, 22 * 32)):
    """Convert a joblib-saved scikit-learn model to an ONNX file.

    :param input_skl_model: path of the joblib model file to load
    :param output_onnx_model: path the ONNX model is written to
    :param input_shape: shape declared for the ONNX float input; presumably
        (number of rows, number of feature columns) rather than image
        height/width - confirm against the model's training data
    """
    input_data_type = [('float_input', FloatTensorType(list(input_shape)))]
    skl_model = joblib.load(input_skl_model)
    onnx_model = onnxmltools.convert_sklearn(skl_model, initial_types=input_data_type)
    onnxmltools.utils.save_model(onnx_model, output_onnx_model)



def start_svm():
    """Run the exported ONNX SVM on a saved input array and decode the result.

    :return: the predicted characters joined into one string
    """
    # Lookup table from predicted class index to character.
    num2label_svm = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    x_input = np.load("../weight/svm_00GS.npy")
    svm_ort = onnxruntime.InferenceSession('../weight/svm_xxx.onnx')
    ort_inputs = {svm_ort.get_inputs()[0].name: x_input.astype(np.float32)}
    ort_outs = svm_ort.run(None, ort_inputs)
    print(len(ort_outs), ort_outs[0])
    # The original indexed the undefined name num2label_IISIS (NameError);
    # the table defined above is the one intended.
    # NOTE(review): assumes ort_outs[0] holds integer class indices - confirm.
    dst = "".join(num2label_svm[i] for i in ort_outs[0])
    return dst


if __name__ == '__main__':
    # Convert first: start_svm() loads the ONNX file that skl_to_onnx() writes.
    for step in (skl_to_onnx, start_svm):
        step()