SVM 实现mnist 手写数字图像识别

原创

SongpingWang 2022-08-24 21:40:53 博主文章分类：机器学习—算法及代码 ©著作权

文章标签 机器学习核函数数据数据集 文章分类 机器学习人工智能

©著作权归作者所有：来自51CTO博客作者SongpingWang的原创作品，请联系作者获取转载授权，否则将追究法律责任

文章目录

一、数据集准备
二、训练

2.1 模型训练
2.2 模型预测

三、模型优化

3.1 多特征拼接
3.2 使用多模型融合

附：skl2onnx

一、数据集准备

MNIST数据集：点击下载 data2image.py —将mnist数据集转成image保存。训练集6W张，测试集1W张

import numpy as np
import struct
import cv2
import uuid


train_images_idx3_ubyte_file = 'E:/data/raw/train-images.idx3-ubyte' # 训练集文件
train_labels_idx1_ubyte_file = 'E:/data/raw/train-labels.idx1-ubyte' # 训练集标签文件

test_images_idx3_ubyte_file = 'E:/data/raw/t10k-images.idx3-ubyte'  # 测试集文件
test_labels_idx1_ubyte_file = 'E:/data/raw/t10k-labels.idx1-ubyte'  # 测试集标签文件


def decode_idx3_ubyte(idx3_ubyte_file):
    """
    Parse an IDX3 (unsigned-byte image) file into a NumPy array.

    The file starts with four big-endian 32-bit integers (magic number,
    image count, rows, columns) followed by one unsigned byte per pixel.

    :param idx3_ubyte_file: path of the idx3 file
    :return: float64 array of shape (num_images, num_rows, num_cols)
    :raises ValueError: if the file contains fewer pixel bytes than the
        header announces
    """
    # Read the whole file and release the handle deterministically.
    with open(idx3_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number, number of images, image height, image width.
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))

    # The pixel payload begins right after the 16-byte header. Decoding it in
    # one shot replaces the per-image struct.unpack_from loop.
    offset = struct.calcsize(fmt_header)
    pixels = np.frombuffer(
        bin_data,
        dtype=np.uint8,
        count=num_images * num_rows * num_cols,
        offset=offset,
    )
    # astype() copies, so the result is writable and keeps the float64 dtype
    # callers received from the previous np.empty-based implementation.
    return pixels.reshape((num_images, num_rows, num_cols)).astype(np.float64)


def decode_idx1_ubyte(idx1_ubyte_file):
    """
    Parse an IDX1 (unsigned-byte label) file into a NumPy array.

    The file starts with two big-endian 32-bit integers (magic number and
    label count) followed by one unsigned byte per label.

    :param idx1_ubyte_file: path of the idx1 file
    :return: 1-D float64 array holding one label per item
    :raises ValueError: if the file contains fewer label bytes than the
        header announces
    """
    # Read the whole file and release the handle deterministically.
    with open(idx1_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and number of labels.
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))

    # Labels follow the 8-byte header, one unsigned byte each; decode them in
    # a single call instead of unpacking byte by byte.
    offset = struct.calcsize(fmt_header)
    labels = np.frombuffer(bin_data, dtype=np.uint8, count=num_images, offset=offset)
    # Keep the float64 dtype of the previous np.empty-based implementation.
    return labels.astype(np.float64)


def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """
    Load the training images by delegating to decode_idx3_ubyte.

    :param idx_ubyte_file: path of the idx3 file; defaults to the
        module-level training-image path
    :return: np.array of shape (n, rows, cols), where n is the image count
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """
    Load the training labels by delegating to decode_idx1_ubyte.

    :param idx_ubyte_file: path of the idx1 file; defaults to the
        module-level training-label path
    :return: 1-D np.array of length n, where n is the label count
    """
    return decode_idx1_ubyte(idx_ubyte_file)


def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """
    Load the test images by delegating to decode_idx3_ubyte.

    :param idx_ubyte_file: path of the idx3 file; defaults to the
        module-level test-image path
    :return: np.array of shape (n, rows, cols), where n is the image count
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """
    Load the test labels by delegating to decode_idx1_ubyte.

    :param idx_ubyte_file: path of the idx1 file; defaults to the
        module-level test-label path
    :return: 1-D np.array of length n, where n is the label count
    """
    return decode_idx1_ubyte(idx_ubyte_file)



if __name__ == '__main__':
    # Export the test set as PNG files named "<label>_<index>.png".
    # To export the training set instead, load it with load_train_images() /
    # load_train_labels() and write to 'E:/data/raw/train_image/'.
    save_path = 'E:/data/raw/test_image/'
    test_images = load_test_images()
    test_labels = load_test_labels()
    for i, image in enumerate(test_images):
        # Labels are stored as floats; cast to int so the name starts with a digit.
        file_name = save_path + str(int(test_labels[i])) + '_' + str(i) + '.png'
        cv2.imwrite(file_name, image.astype(np.uint8))
    print('done')

二、训练

使用svm进行多分类训练。【关于SVM理论】，【关于SVM超平面推导】。

在机器学习实战书中提到将MNIST-image数据集运用SVM进行识别，以线性核函数、RBF核函数、Sigmoid核函数为核函数，以OVO和OVR策略，两两搭配，共有六种组合，以这六种组合分别训练和测试数据集，并记录其训练时间、测试时间、准确率和平均准确率，得到如下两表：

SVM 实现mnist 手写数字图像识别_核函数

造成核函数和策略组合性能不同的原因如下：

（1）线性核函数、RBF核函数和Sigmoid核函数公式的复杂度不同，导致训练时间和测试时间出现差异。

（2）理论上，OVR只需要训练N个分类器，而OVO需要训练N(N-1)/2个分类器，因此OVO的存储开销和测试时间开销通常比OVR更大。而在训练时，OVR的每个分类器均使用全部训练样例，而OVO的每个分类器仅用到两个类的样例。因此，在类别很多的时候，OVO的训练时间开销通常比OVR更小。

（3）手写数字识别中，各种数字写法复杂，这明显是线性不可分的情景，所以线性核函数的准确率较低。

2.1 模型训练

所有训练集图片放在同一个文件夹内

from PIL import Image
import os
import numpy as np
import time
from sklearn import svm
import joblib



def get_img(path):
    """Return the joined paths of all entries in *path* whose name ends with ".png"."""
    png_paths = []
    for entry in os.listdir(path):
        if entry.endswith(".png"):
            png_paths.append(os.path.join(path, entry))
    return png_paths


def img2vector(imgFile):
    """Load an image as grayscale and flatten it into one binarised row vector.

    Pixels are divided by 255 and rounded, so every entry is 0.0 or 1.0.
    The result has shape (1, height * width), i.e. (1, 784) for a 28x28 image.
    """
    gray = Image.open(imgFile).convert('L')
    pixels = np.array(gray, 'i')        # integer grayscale matrix
    binary = np.round(pixels / 255)     # scale to [0, 1], then snap to 0 or 1
    return binary.reshape(1, -1)


def read_and_convert(imgFileList):
    """Build the feature matrix and label vector for a list of image paths.

    The class label is the first character of each file name, which is
    expected to look like "<digit>_<index>.png".

    :param imgFileList: sequence of image file paths
    :return: tuple (dataMat, dataLabel); dataMat has shape (n, 784) with one
        flattened image per row, dataLabel is a uint8 vector of length n
    """
    dataNum = len(imgFileList)
    dataLabel = np.zeros(dataNum, dtype=np.uint8)   # class label per image
    dataMat = np.zeros((dataNum, 784))              # one flattened 28x28 image per row
    for i, img_path in enumerate(imgFileList):
        # os.path.basename copes with whichever separator os.path.join produced;
        # splitting on "/" alone picked the wrong character for paths joined with "\\".
        dataLabel[i] = int(os.path.basename(img_path)[0])
        dataMat[i, :] = img2vector(img_path)
    return dataMat, dataLabel


# Load the training data
def read_all_data(train_data_path):
    """List the PNG files in *train_data_path* and convert them to (dataMat, dataLabel)."""
    return read_and_convert(get_img(train_data_path))

# Train the classifier and persist it
def create_svm(dataMat, dataLabel, path, decision='ovr'):
    """Fit an RBF-kernel SVC (C=1.0) on the data, dump it to *path*, and return it.

    :param dataMat: feature matrix, one sample per row
    :param dataLabel: class label per sample
    :param path: file the fitted model is written to with joblib
    :param decision: value passed as decision_function_shape
    :return: the fitted classifier
    """
    clf = svm.SVC(kernel='rbf', C=1.0, decision_function_shape=decision)
    # fit() returns the estimator itself, so dumping clf after fitting stores
    # the same object the original dumped via the return value of fit().
    clf.fit(dataMat, dataLabel)
    joblib.dump(clf, path)
    return clf


'''
SVC参数
svm.SVC(C=1.0,kernel='rbf',degree=3,gamma='auto',coef0=0.0,shrinking=True,probability=False,
tol=0.001,cache_size=200,class_weight=None,verbose=False,max_iter=-1,decision_function_shape='ovr',random_state=None)

C：C-SVC的惩罚参数C，默认值是1.0
   C越大，相当于惩罚松弛变量，希望松弛变量接近0，即对误分类的惩罚增大，这样对训练集测试时准确率很高，但泛化能力弱。
   C值小，对误分类的惩罚减小，允许容错，将他们当成噪声点，泛化能力较强。

kernel ：核函数，默认是rbf，可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 
       0 – 线性：  u'v
       1 – 多项式：(gamma*u'*v + coef0)^degree
       2 – RBF函数：exp(-gamma|u-v|^2)
       3 –sigmoid： tanh(gamma*u'*v + coef0)
degree ：多项式poly函数的维度，默认是3，选择其他核函数时会被忽略。（没用）
gamma ： ‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。取’auto’时会选择1/n_features（注意：scikit-learn 0.22 起默认值改为’scale’，即 1/(n_features * X.var())）
coef0 ： 核函数的常数项。对于‘poly’和 ‘sigmoid’有用。（没用）
probability ：是否启用概率估计，默认为False
shrinking ：  是否采用shrinking heuristic方法，默认为true
tol ：         停止训练的误差值大小，默认为1e-3
cache_size ：  核函数cache缓存大小，默认为200
class_weight ：类别的权重，字典形式传递。设置第几类的参数C为weight*C(C-SVC中的C)
verbose ：     允许冗余输出？
max_iter ：    最大迭代次数。-1为无限制。
decision_function_shape ：‘ovo’ 或 ‘ovr’，默认为 ‘ovr’（本文选用ovr，一对多）
random_state ：  数据洗牌时的种子值，int值

主要调节的参数有：C、kernel、degree、gamma、coef0
'''

if __name__ == '__main__':
    model_path = 'E:/data/raw/model/svm.model'
    train_data_path = "E:/data/raw/train_image/"

    # Load every training image into memory first.
    dataMat, dataLabel = read_all_data(train_data_path)
    print("read data done!")

    # Time only the fit + dump step.
    st = time.time()
    create_svm(dataMat, dataLabel, model_path, decision='ovr')
    elapsed = time.time() - st
    print("Training spent {:.4f}s.".format(elapsed))

2.2 模型预测

所有测试集图片放在同一个文件夹内

import time
import os
import joblib
from PIL import Image
import numpy as np


def get_img(path):
    """Collect the full paths of the ".png" entries found directly in *path*."""
    png_names = filter(lambda name: name.endswith(".png"), os.listdir(path))
    return [os.path.join(path, name) for name in png_names]

def img2vector(imgFile):
    """Read *imgFile* as grayscale and return it as a binarised (1, H*W) row vector.

    Each pixel is divided by 255 and rounded to 0.0 or 1.0; a 28x28 image
    therefore yields a (1, 784) array.
    """
    scaled = np.array(Image.open(imgFile).convert('L'), 'i') / 255
    return np.reshape(np.round(scaled), (1, -1))

def svm_test(test_data_path, model_path, result_path="E:/data/raw/pre_result.txt"):
    """Evaluate a saved SVM on every PNG in a directory and log per-image results.

    The true label is the first character of each file name. After every
    prediction the running accuracy and throughput are printed and appended
    to a CSV-style result file.

    :param test_data_path: directory containing the test PNG files
    :param model_path: joblib file holding the fitted classifier
    :param result_path: file the per-image results are written to
    """
    # NOTE: joblib.load unpickles the file; only load models from a trusted source.
    clf = joblib.load(model_path)
    img_list = get_img(test_data_path)

    error_count = 0
    t0 = time.time()
    # The context manager closes the result file even if a prediction fails.
    with open(result_path, 'w') as f:
        f.write('imgName,actual,pre_result,accuracy_rate,FPS')
        f.write("\n")

        for i, img_path in enumerate(img_list):
            # basename works for both "/" and the platform separator.
            dataLabel = os.path.basename(img_path)[0]
            dataMat = img2vector(img_path)
            preResult = clf.predict(dataMat)[0]
            print("num :"+str(i+1),"dataLabel :" ,dataLabel, " preResult:",preResult)
            if str(preResult) != dataLabel:
                error_count += 1

            accuracy_rate = (i + 1 - error_count) / (i + 1) * 100
            print("The accuracy rate is: ",accuracy_rate, "%")

            # A coarse clock can report zero elapsed time for the first images;
            # guard the division instead of raising ZeroDivisionError.
            elapsed = time.time() - t0
            FPS = (i + 1) / elapsed if elapsed > 0 else float('inf')
            print("FPS is:",FPS)
            f.write(",".join([img_path,dataLabel,str(preResult),str(accuracy_rate),str(FPS)]))
            f.write("\n")

if __name__ == '__main__':
    # Evaluate the single-model SVM on the exported test images.
    model_path = 'E:/data/raw/model/svm.model'
    test_data_path = "E:/data/raw/test_image/"
    svm_test(test_data_path, model_path)

在上述中预测准确率97.45%

三、模型优化

【关于网格搜索】【多特征融合】

3.1 多特征拼接

使用了投影特征：水平投影与垂直投影

# Projection features added on top of the raw pixels
def project(img_arr):
    """Compute normalised vertical and horizontal projection profiles.

    A pixel counts as foreground when it is non-zero. If the array carries a
    trailing channel axis, a pixel is foreground when any channel is non-zero.

    :param img_arr: image array whose first two axes are (height, width)
    :return: tuple (vertical_norm, horizontal_norm); vertical_norm has shape
        (1, width) and holds the fraction of foreground pixels per column,
        horizontal_norm has shape (1, height) and holds the fraction per row
    """
    height, width = img_arr.shape[:2]

    foreground = np.asarray(img_arr) != 0
    if foreground.ndim > 2:
        # Collapse any channel axes so each (row, column) yields one flag.
        foreground = foreground.reshape(height, width, -1).any(axis=2)

    # Vertical projection: foreground count of every column.
    vertical = np.count_nonzero(foreground, axis=0).reshape(1, width)
    # Horizontal projection: foreground count of every row.
    horizontal = np.count_nonzero(foreground, axis=1).reshape(1, height)

    # Normalise by the largest possible count so values lie in [0, 1].
    vertical_norm = np.divide(vertical, height)
    horizontal_norm = np.divide(horizontal, width)
    return vertical_norm, horizontal_norm


# img2vector modified to concatenate the projection features
def img2vector(imgFile):
    """Return pixel + projection features of an image as one row vector.

    The image is read as grayscale and thresholded at 127. The flattened
    0/1 pixels are followed by the vertical and then the horizontal
    projection profile, giving shape (1, H*W + W + H), i.e. (1, 840) for a
    28x28 image.

    :param imgFile: path of the image file
    :return: feature row vector of shape (1, H*W + W + H)
    :raises OSError: if the image cannot be read
    """
    gray = cv2.imread(imgFile, 0)
    if gray is None:
        # cv2.imread signals a missing or undecodable file by returning None.
        raise OSError("cannot read image: {}".format(imgFile))
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

    img_normalization = np.round(thresh / 255)         # 255 -> 1.0, 0 -> 0.0
    img_arr2 = np.reshape(img_normalization, (1, -1))  # 1 x (H*W) row vector
    vertical_norm, horizontal_norm = project(thresh)
    img_arr4 = np.concatenate((img_arr2, vertical_norm, horizontal_norm), axis=1)
    # The original ended with a bare `return`, handing None to every caller.
    return img_arr4

在上述1W张测试集上预测准确率为97.67%左右，相比只使用像素特征的97.45%没有明显提升。原因在于投影特征只是对同一批二值像素按列、按行求和，与原始像素特征高度冗余，几乎没有带来新的信息。

3.2 使用多模型融合

采取多模型的权重分配决定最终预测结果。

train.py

from PIL import Image
import os
import numpy as np
import time
from sklearn import svm
import joblib
import cv2


def get_img(path):
    """List the ".png" entries of directory *path* as joined paths."""
    found = []
    for file_name in os.listdir(path):
        if not file_name.endswith(".png"):
            continue
        found.append(os.path.join(path, file_name))
    return found


class Make_features():
    """Feature extractor for one image, binarised with a threshold of 127.

    The thresholded image (values 0 or 255) is stored in ``self.thresh`` and
    shared by all feature methods.
    """

    def __init__(self, imgFile):
        """Read *imgFile* as grayscale and threshold it.

        :param imgFile: path of the image file
        :raises OSError: if the image cannot be read
        """
        self.imgFile = imgFile
        gray = cv2.imread(self.imgFile, 0)
        if gray is None:
            # cv2.imread signals a missing or undecodable file by returning None.
            raise OSError("cannot read image: {}".format(self.imgFile))
        _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        self.thresh = thresh

    def features_vertical(self):
        """Return the vertical projection as a (1, width) array.

        Each entry is the number of non-zero pixels in a column divided by
        the image height.
        """
        height = self.thresh.shape[0]
        # Vectorised per-column count replaces the nested pixel loops.
        vertical = np.count_nonzero(self.thresh, axis=0).reshape(1, -1)
        return np.divide(vertical, height)

    def features_horizont(self):
        """Return the horizontal projection as a (1, height) array.

        Each entry is the number of non-zero pixels in a row divided by the
        image width.
        """
        width = self.thresh.shape[1]
        # Vectorised per-row count replaces the nested pixel loops.
        horizont = np.count_nonzero(self.thresh, axis=1).reshape(1, -1)
        return np.divide(horizont, width)

    def features_img2vector(self):
        """Return the binarised pixels as a (1, height * width) row of 0.0/1.0."""
        img_normalization = np.round(self.thresh / 255)  # 255 -> 1.0, 0 -> 0.0
        return np.reshape(img_normalization, (1, -1))



# Load the training data and build one matrix per feature type
def Read_All_Img_convert(train_data_path):
    """Read every PNG in a directory and build the three feature matrices.

    The class label is the first character of each file name, which is
    expected to look like "<digit>_<index>.png".

    :param train_data_path: directory containing the training PNG files
    :return: tuple (dataMat_imgarray, dataMat_vertical, dataMat_horizont,
        dataLabel) with shapes (n, 784), (n, 28), (n, 28) and (n,)
    """
    imgFileList = get_img(train_data_path)
    dataNum = len(imgFileList)
    dataLabel = np.zeros(dataNum, dtype=np.uint8)   # class label per image
    dataMat_imgarray = np.zeros((dataNum, 784))     # flattened 28x28 pixels per row
    dataMat_vertical = np.zeros((dataNum, 28))      # per-column projection
    dataMat_horizont = np.zeros((dataNum, 28))      # per-row projection
    for i, img_path in enumerate(imgFileList):
        # basename works for both "/" and the platform separator.
        dataLabel[i] = int(os.path.basename(img_path)[0])
        # Decode and threshold each image once; the original built three
        # extractors per image and therefore read every file three times.
        features = Make_features(img_path)
        dataMat_imgarray[i, :] = features.features_img2vector()
        dataMat_vertical[i, :] = features.features_vertical()
        dataMat_horizont[i, :] = features.features_horizont()
    return dataMat_imgarray, dataMat_vertical, dataMat_horizont, dataLabel


class SVMs():
    """Bundle one training set with the path its fitted SVM is saved to."""

    def __init__(self, model_path, dataMat, dataLabel):
        self.dataMat, self.dataLabel = dataMat, dataLabel
        self.model_path = model_path

    # Train and persist the model
    def create_svm(self, decision='ovr'):
        """Fit an RBF SVC with probability estimates and dump it to ``model_path``."""
        classifier = svm.SVC(
            C=1.0,
            kernel='rbf',
            probability=True,
            decision_function_shape=decision,
        )
        fitted = classifier.fit(self.dataMat, self.dataLabel)
        joblib.dump(fitted, self.model_path)


if __name__ == '__main__':
    train_data_path = "E:/data/raw/train_image/"
    dataMat_imgarray, dataMat_vertical, dataMat_horizont, dataLabel = Read_All_Img_convert(train_data_path)
    print("read data done!")

    model_path0 = 'E:/data/raw/model/svm4_0.model'
    model_path1 = 'E:/data/raw/model/svm4_1.model'
    model_path2 = 'E:/data/raw/model/svm4_2.model'

    # One model per feature type: raw pixels, vertical and horizontal projection.
    training_jobs = (
        (model_path0, dataMat_imgarray),
        (model_path1, dataMat_vertical),
        (model_path2, dataMat_horizont),
    )

    started = time.time()
    for job_model_path, job_features in training_jobs:
        SVMs(model_path=job_model_path, dataMat=job_features, dataLabel=dataLabel).create_svm()
        finished = time.time()
        print("Training spent {:.4f}s.".format((finished - started)))
        # The next model's timer starts where this one stopped.
        started = finished

test.py

import time
import os
import joblib
import numpy as np
import cv2


def get_img(path):
    """Return joined paths for each directory entry of *path* ending in ".png"."""
    entries = os.listdir(path)
    return list(
        os.path.join(path, entry)
        for entry in entries
        if entry.endswith(".png")
    )


class Make_features():
    """Feature extractor for one image, binarised with a threshold of 127.

    The thresholded image (values 0 or 255) is stored in ``self.thresh`` and
    shared by all feature methods.
    """

    def __init__(self, imgFile):
        """Read *imgFile* as grayscale and threshold it.

        :param imgFile: path of the image file
        :raises OSError: if the image cannot be read
        """
        self.imgFile = imgFile
        gray = cv2.imread(self.imgFile, 0)
        if gray is None:
            # cv2.imread signals a missing or undecodable file by returning None.
            raise OSError("cannot read image: {}".format(self.imgFile))
        _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        self.thresh = thresh

    def features_vertical(self):
        """Return the vertical projection as a (1, width) array.

        Each entry is the number of non-zero pixels in a column divided by
        the image height.
        """
        height = self.thresh.shape[0]
        # Vectorised per-column count replaces the nested pixel loops.
        vertical = np.count_nonzero(self.thresh, axis=0).reshape(1, -1)
        return np.divide(vertical, height)

    def features_horizont(self):
        """Return the horizontal projection as a (1, height) array.

        Each entry is the number of non-zero pixels in a row divided by the
        image width.
        """
        width = self.thresh.shape[1]
        # Vectorised per-row count replaces the nested pixel loops.
        horizont = np.count_nonzero(self.thresh, axis=1).reshape(1, -1)
        return np.divide(horizont, width)

    def features_img2vector(self):
        """Return the binarised pixels as a (1, height * width) row of 0.0/1.0."""
        img_normalization = np.round(self.thresh / 255)  # 255 -> 1.0, 0 -> 0.0
        return np.reshape(img_normalization, (1, -1))


class SVM_TEST():
    """Evaluate a weighted ensemble of three saved SVMs on a directory of PNGs.

    Model 0 is fed the flattened pixels, model 1 the vertical projection and
    model 2 the horizontal projection of each image.
    """

    def __init__(self, test_data_path, model_path0, model_path1, model_path2):
        self.model_path0 = model_path0
        self.model_path1 = model_path1
        self.model_path2 = model_path2
        self.test_data_path = test_data_path

    def svm_test(self, weights=(0.8, 0.1, 0.1), result_path="E:/data/raw/pre_result4_2.txt"):
        """Predict every test image and log running accuracy and throughput.

        The true label is the first character of each file name. The class
        probabilities of the three models are blended with *weights* and the
        most probable class is taken as the prediction.

        :param weights: blend weights for models 0, 1 and 2
        :param result_path: file the per-image results are written to
        """
        # NOTE: joblib.load unpickles the files; only load models from a trusted source.
        clf0 = joblib.load(self.model_path0)
        clf1 = joblib.load(self.model_path1)
        clf2 = joblib.load(self.model_path2)
        img_list = get_img(self.test_data_path)
        w0, w1, w2 = weights

        error_count = 0
        t0 = time.time()
        # The context manager closes the result file even if a prediction fails.
        with open(result_path, 'w') as f:
            f.write('imgName,actual,pre_result,accuracy_rate,FPS')
            f.write("\n")

            for i, img_path in enumerate(img_list):
                # basename works for both "/" and the platform separator.
                dataLabel = os.path.basename(img_path)[0]

                # Decode and threshold the image once for all three feature sets.
                features = Make_features(img_path)
                preResult0 = clf0.predict_proba(features.features_img2vector())
                preResult1 = clf1.predict_proba(features.features_vertical())
                preResult2 = clf2.predict_proba(features.features_horizont())
                blended = w0 * preResult0 + w1 * preResult1 + w2 * preResult2

                # predict_proba columns follow classes_, so map the winning
                # column back to its label instead of assuming index == label.
                # Assumes the three models were trained on the same label set.
                pre_labe = clf0.classes_[int(np.argmax(blended))]

                if str(pre_labe) != dataLabel:
                    error_count += 1

                accuracy_rate = (i + 1 - error_count) / (i + 1) * 100
                print("The accuracy rate is: ", accuracy_rate, "%")

                # A coarse clock can report zero elapsed time; guard the division.
                elapsed = time.time() - t0
                FPS = (i + 1) / elapsed if elapsed > 0 else float('inf')
                print("FPS is:", FPS)
                f.write(",".join([img_path, dataLabel, str(pre_labe), str(accuracy_rate), str(FPS)]))
                f.write("\n")


if __name__ == '__main__':
    # The three models produced by the training script, in feature order:
    # pixels, vertical projection, horizontal projection.
    model_path0 = 'E:/data/raw/model/svm4_0.model'
    model_path1 = 'E:/data/raw/model/svm4_1.model'
    model_path2 = 'E:/data/raw/model/svm4_2.model'
    test_data_path = "E:/data/raw/test_image/"

    tester = SVM_TEST(test_data_path, model_path0, model_path1, model_path2)
    tester.svm_test()

在上述1W张测试集中预测准确率97.80%左右，并没有明显提升。其实我们可以提取更多的有效特征

其实投影的特征真心不行。

还有就是要考虑图像预处理速度和预测速度，特征也不宜过多。

我查看了原始数据，其中有一些外国人手写数字的奇怪写法：其实我们可以删除模棱两可以及写法奇怪的样本，来提高准确率。例如：（数据集中的：0，1，2）

SVM 实现mnist 手写数字图像识别_核函数_02

附：skl2onnx

# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple skl2onnx
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple scikit-learn
import joblib
import onnxmltools
from skl2onnx.common.data_types import FloatTensorType
import numpy as np
import onnxruntime


def skl_to_onnx():
    """Load a joblib-saved scikit-learn model, convert it to ONNX and save it."""
    source_model_path = "../weight/svm_xxx.model"
    target_onnx_path = "../weight/svm_xxx.onnx"

    # Declared input tensor named 'float_input' with shape [4, 22 * 32].
    # NOTE(review): the original comment calls this [h, w]; it is presumably
    # [batch_size, n_features] - confirm against the model's training data.
    initial_types = [('float_input', FloatTensorType([4, 22 * 32]))]

    onnx_model = onnxmltools.convert_sklearn(
        joblib.load(source_model_path),
        initial_types=initial_types,
    )
    onnxmltools.utils.save_model(onnx_model, target_onnx_path)



def start_svm():
    """Run the exported ONNX SVM on a saved batch and decode it to a string.

    The first model output is treated as a sequence of class indices; each
    index is mapped to a character of the 36-symbol alphabet "0-9A-Z".

    :return: the decoded string, one character per input sample
    """
    num2label_svm = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    x_input = np.load("../weight/svm_00GS.npy")
    svm_ort = onnxruntime.InferenceSession('../weight/svm_xxx.onnx')
    ort_inputs = {svm_ort.get_inputs()[0].name: x_input.astype(np.float32)}
    ort_outs = svm_ort.run(None, ort_inputs)
    print(len(ort_outs), ort_outs[0])
    # The original indexed an undefined name (num2label_IISIS) and raised
    # NameError here; the lookup table defined above is the intended one.
    dst = "".join(num2label_svm[i] for i in ort_outs[0])
    return dst


if __name__ == '__main__':
    # Convert the saved scikit-learn model to ONNX, then run the ONNX model once.
    skl_to_onnx()
    start_svm()