根据《深度学习入门》第八章推荐的网络,用PyTorch编写实现对MNIST数据集的识别,识别精度超过99.4%。以下是自己编写的代码:

网络:

import torch.nn as nn
import torch
import numpy as np


# Convolutional network for MNIST (after "Deep Learning from Scratch", ch. 8).
class My_nn_s(nn.Module):
    """Three (conv-conv-pool) stages followed by a two-layer classifier.

    Input:  (N, 1, 28, 28) MNIST images.
    Output: (N, 10) raw logits — feed directly to nn.CrossEntropyLoss.
    """

    def __init__(self):
        super(My_nn_s, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1, stride=1),
            # output (N, C, OH, OW) = (N, 16, 28, 28)
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1, stride=1),
            # output (N, C, OH, OW) = (N, 16, 28, 28)
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            # output (N, C, OH, OW) = (N, 16, 14, 14)
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1, stride=1),
            # output (N, C, OH, OW) = (N, 32, 14, 14)
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=2, stride=1),
            # output (N, C, OH, OW) = (N, 32, 16, 16)
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            # output (N, C, OH, OW) = (N, 32, 8, 8)
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1, stride=1),
            # output (N, C, OH, OW) = (N, 64, 8, 8)
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, stride=1),
            # output (N, C, OH, OW) = (N, 64, 8, 8)
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            # output (N, C, OH, OW) = (N, 64, 4, 4)  (PyTorch floors odd sizes)
            nn.Flatten(),
            # output (N, 1024)
            nn.Linear(1024, 50),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(50, 10),
            # BUG FIX: the original placed a second nn.Dropout() AFTER the final
            # Linear layer.  Dropout applied to the output logits randomly zeroes
            # class scores during training and corrupts the cross-entropy loss;
            # dropout belongs only on hidden activations.
        )

        # He (Kaiming) initialization: W ~ N(0, sqrt(2 / fan_in)), b = 0.
        # BUG FIX: the original computed the scales by hand and inserted an
        # extra fan-in (1*3*3) at index 0 of the list, which shifted every
        # scale by one position — each layer ended up initialized with the
        # PREVIOUS layer's fan-in.  For a conv layer the correct He fan_in is
        # in_channels * kh * kw of the layer itself (the size of one neuron's
        # receptive field, NOT the whole 28x28 image), which is exactly what
        # nn.init.kaiming_normal_ computes.
        for layer in self.model:
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Forward pass: x is (N, 1, 28, 28); returns (N, 10) logits."""
        x = self.model(x)
        return x


if __name__ == '__main__':
    # Smoke check: build the network (uncomment the print to inspect layers).
    network = My_nn_s()
    # print(network)

这里采用“He初始值”对权重参数W进行初始化,这里有个问题,即第一层(卷积层)的n为什么是1*3*3而不是图片的节点数即1*28*28。(原因:对卷积层而言,He初始值中的 n 指的是每个输出神经元的输入连接数,即 in_channels×kh×kw = 1×3×3;卷积核只与局部感受野相连,并不是与整幅 28×28 图像全连接。)

另外,“He初始值”规定用前一层的节点数来确定本层权重的初始标准差,但有些文章(如知乎的《PyTorch参数初始化和Finetune》)却按本层的输入连接数(fan_in)来计算。

训练代码:

# 建立神经网络顺序:
# 1、准备数据集,提取数据集的长度
# 2、利用DataLoader加载数据集
# 3、创建网络模型
# 4、定义损失函数
# 5、定义优化器(定义学习率等)
# 6、设置训练网络的参数--(记录训练次数total_train_step, 测试的次数total_test_step
#    训练的轮数epoch)
# 7、添加tensorboard
# 8、训练模型(类型.train())和评价模型 (类型.eval()),训练完每一轮后保存模型

# 注意,每训练完一轮可以和测试数据进行比较,用到with torch.no_grad(),即预测时关闭梯度


import os

import torch
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from net_work import *  # 网络位于当前文件夹中的net_work.py

# 1. Prepare the datasets: MNIST, converted to float tensors in [0, 1].
data_train = torchvision.datasets.MNIST('./dataset', train=True, transform=torchvision.transforms.ToTensor(),
                                        download=True)

data_test = torchvision.datasets.MNIST('./dataset', train=False, transform=torchvision.transforms.ToTensor(),
                                       download=True)

# Device selection: use the GPU when one is available.
# Apply .to(device) to the network, the loss function and every data batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 1.1 Record the dataset sizes (used later as the accuracy denominator).
len_train_data = len(data_train)  # 60000 for the MNIST training split (the original comment said 50000)
print('训练集的长度为: {}'.format(len_train_data))
len_test_data = len(data_test)  # 10000
print('测试集的长度为: {}'.format(len_test_data))

# 2. Wrap the datasets in DataLoaders.
data_train_loader = DataLoader(data_train, batch_size=100, drop_last=True, shuffle=True)
# BUG FIX: the original test loader used the default batch_size=1 (extremely
# slow — 10000 forward passes per epoch) and drop_last=True, which can
# silently drop samples while the accuracy is still divided by the full
# len_test_data.  Evaluate every sample, in batches.
data_test_loader = DataLoader(data_test, batch_size=100, drop_last=False)

# 3. Build the network and move it to the chosen device.
my_nn = My_nn_s().to(device)

# 4. Loss function: cross-entropy over the 10 digit classes.
my_loss = nn.CrossEntropyLoss().to(device)

# 5. Optimizer (sets the learning rate, etc.).
learn_rate = 0.001  # with Adam, a large rate such as 0.1 fails to learn
# optimer = torch.optim.SGD(my_nn.parameters(), lr=learn_rate)
optimer = torch.optim.Adam(my_nn.parameters(), lr=learn_rate)
# optimer = torch.optim.Adam(my_nn.parameters(), lr=learn_rate, betas=(0.9, 0.999))

# 6. Bookkeeping: number of epochs and running step counters
#    (total_train_step counts training batches, total_test_step test batches).
epoch = 20
total_train_step = 0
total_test_step = 0

# 7. TensorBoard writer; event files go to ./train_test.
writer = SummaryWriter('train_test')

# 8. For each epoch: train (.train()) then evaluate (.eval()) and checkpoint.
for i_epoch in range(epoch):
    print('----------第{}轮训练开始----------'.format(i_epoch + 1))
    # 8.1 Training phase.
    my_nn.train()  # enables train-time behaviour of layers such as Dropout/BatchNorm
    for data in data_train_loader:
        imgs, targets = data  # one batch of images and labels
        imgs = imgs.to(device)
        targets = targets.to(device)

        optimer.zero_grad()  # clear gradients left over from the previous step
        predict = my_nn(imgs)  # forward pass
        loss = my_loss(predict, targets)
        loss.backward()  # back-propagate
        optimer.step()  # update parameters
        total_train_step += 1
        # argmax(1) picks the predicted class per row (per sample); .item()
        # converts the 0-dim tensor to a Python number.
        # BUG FIX: use the tensor's own .sum() instead of the Python builtin
        # sum() over a bool tensor (slower, element-wise iteration).
        accuracy_test = (predict.argmax(1) == targets).sum().item() / len(targets)

        if (total_train_step + 1) % 100 == 0:
            print('训练次数:{},Loss: {}, Accuracy: {}%'.format(total_train_step + 1, loss.item(), accuracy_test * 100))
            writer.add_scalar('train_loss', loss.item(), total_train_step)
            writer.add_scalar('train_accuracy', accuracy_test * 100, total_train_step)

    # 8.2 Evaluation phase.
    my_nn.eval()  # disables Dropout etc. so evaluation is deterministic
    total_test_loss = 0.0  # accumulated loss over the whole test set
    total_test_accuracy = 0.0  # accumulated count of correct predictions
    with torch.no_grad():  # no gradients are needed for evaluation
        for data_1 in data_test_loader:
            imgs_1, targets_1 = data_1
            imgs_1 = imgs_1.to(device)
            targets_1 = targets_1.to(device)

            predict = my_nn(imgs_1)  # forward pass
            loss = my_loss(predict, targets_1)
            total_test_loss += loss.item()
            total_test_step += 1
            total_test_accuracy += (predict.argmax(1) == targets_1).sum().item()
    print('整体测试集上的Loss: {}'.format(total_test_loss))
    print('整体测试集上的正确率: {}'.format(total_test_accuracy / len_test_data * 100))
    # BUG FIX: the original logged loss.item() here — the loss of the LAST
    # test batch only — instead of the accumulated total_test_loss, and never
    # logged the test accuracy.  Log per-epoch aggregates against the epoch.
    writer.add_scalar('test_loss', total_test_loss, i_epoch + 1)
    writer.add_scalar('test_accuracy', total_test_accuracy / len_test_data * 100, i_epoch + 1)

    # 8.3 Save a checkpoint after every epoch.
    # BUG FIX: torch.save raises FileNotFoundError when the target directory
    # does not exist; create it first.
    os.makedirs('./net_work_save', exist_ok=True)
    torch.save(my_nn,
               './net_work_save/My_nn_{}_ac={}%.pth'.format(i_epoch + 1, total_test_accuracy / len_test_data * 100))
    # torch.save(my_nn.state_dict(), ...) is the officially recommended form.
    print('模型已保存')

writer.close()

这里学习率为0.001,采用基于Adam的最优化。注意,当采用Adam时学习率太大(0.1)无法正常学习。

另外,由于Dropout层的存在,在训练集上的精度没法正常显示(远低于在测试集上的精度)。