pytorch无报错中断 pytorch checkpoint

转载

索姆拉 2023-11-30 10:29:46

文章标签 pytorch无报错中断 pytorch模型加载参数加载 文章分类 PyTorch 人工智能

1.Pytorch 模型保存与加载，并在加载的模型基础上继续训练

只保存参数；（官方推荐）

torch.save(model.state_dict(), path) #只保存参数 

#特别地，如果还想保存某一次训练采用的优化器、epochs等信息，可将这些信息组合
#起来构成一个字典，然后将字典保存起来
state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
torch.save(state, path)

#加载 
model.load_state_dict(torch.load(path))

#针对上述第二种以字典形式保存的方法，加载方式如下
checkpoint = torch.load(path)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
epoch = checkpoint(['epoch'])

ps:只保存参数的方法在加载的时候要事先定义好跟原模型一致的模型，并在该模型的实例对象(假设名为model)上进行加载，即在使用上述加载语句前已经有定义了一个和原模型一样的Net, 并且进行了实例化 model=Net( )

下面给出一个具体的例子程序，该程序只保存最新的参数

#-*- coding:utf-8 -*-

import torch as torch
import torchvision as tv
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
import torch.backends.cudnn as cudnn
import datetime
import argparse

# 参数声明
batch_size = 32
epochs = 10
WORKERS = 0   # dataloder线程数
test_flag = True  #测试标志，True时加载保存好的模型进行测试 
ROOT = '/home/pxt/pytorch/cifar'  
log_dir = '/home/pxt/pytorch/logs/cifar_model.pth'  # 模型保存路径

# 加载MNIST数据集
transform = tv.transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

train_data = tv.datasets.CIFAR10(root=ROOT, train=True, download=True, transform=transform)
test_data = tv.datasets.CIFAR10(root=ROOT, train=False, download=False, transform=transform)

train_load = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=WORKERS)
test_load = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=WORKERS)


# 构造模型
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.conv4 = nn.Conv2d(256, 256, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(256 * 8 * 8, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 10)
    
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = self.pool(F.relu(self.conv4(x)))
        x = x.view(-1, x.size()[1] * x.size()[2] * x.size()[3])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


device = torch.device("cuda" if use_cuda else "cpu")
#model = Net().cuda()
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


# 模型训练
def train(model, train_loader, epoch):
    model.train()
    train_loss = 0
    for i, data in enumerate(train_loader, 0):
        x, y = data
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}t Loss: {:.6f}'.format(epoch, loss_mean.item()))

# 模型测试
def test(model, test_loader):
    model.eval（)
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            x, y = data
            x = x.cuda()
            y = y.cuda()
            optimizer.zero_grad()
            y_hat = model(x)
            test_loss += criterion(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
        test_loss /= (i+1)
        print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)n'.format(
            test_loss, correct, len(test_data), 100. * correct / len(test_data)))


def main():

    # 如果test_flag=True,则加载已保存的模型
    if test_flag:
        # 加载保存的模型直接进行测试机验证，不进行此模块以后的步骤
        checkpoint = torch.load(log_dir)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epochs = checkpoint['epoch']
        test(model, test_load)
        return

    for epoch in range(0, epochs):
        train(model, train_load, epoch)
        test(model, test_load)
        # 保存模型
        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
        torch.save(state, log_dir)

if __name__ == '__main__':
    main()

在加载的模型基础上继续训练

在训练模型的时候可能会因为一些问题导致程序中断，或者常常需要观察训练情况的变化来更改学习率等参数，这时候就需要加载中断前保存的模型，并在此基础上继续训练，这时候只需要对上例中的 main() 函数做相应的修改即可，修改后的 main() 函数如下：

def main():

    # 如果test_flag=True,则加载已保存的模型
    if test_flag:
        # 加载保存的模型直接进行测试机验证，不进行此模块以后的步骤
        checkpoint = torch.load(log_dir)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        test(model, test_load)
        return

    # 如果有保存的模型，则加载模型，并在其基础上继续训练
    if os.path.exists(log_dir):
        checkpoint = torch.load(log_dir)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 成功！'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型，将从头开始训练！')

    for epoch in range(start_epoch+1, epochs):
        train(model, train_load, epoch)
        test(model, test_load)
        # 保存模型
        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
        torch.save(state, log_dir)

2. tqdm 进度条 pip install tpdm

import time
from tqdm import *

for i in tqdm(range(1000)): # 传入可迭代对象
    time.sleep(.01)


pbar = tqdm(["a", "b", "c", "d"])
for char in pbar: # 可以为进度条设置描述
    # 设置描述
    pbar.set_description("Processing %s" % char)
    time.sleep(1)

3. lmdb pip install lmdb

#LMDB的全称是Lightning Memory-Mapped Database(快如闪电的内存映射数据库)
import lmdb
import os, sys

def initialize():
    env = lmdb.open("lmdb_dir")
    return env

def insert(env, sid, name):
    txn = env.begin(write=True)
    txn.put(str(sid).encode(), name.encode())
    txn.commit()

def delete(env, sid):
    txn = env.begin(write=True)
    txn.delete(str(sid).encode())
    txn.commit()

def update(env, sid, name):
    txn = env.begin(write=True)
    txn.put(str(sid).encode(), name.encode())
    txn.commit()

def search(env, sid):
    txn = env.begin()
    name = txn.get(str(sid).encode()).decode()
    return name

def display(env):
    txn = env.begin()
    cur = txn.cursor()
    for key, value in cur:
        print(key.decode(), value.decode())


env = initialize()

print("Insert 3 records.")
insert(env, 1, "Alice")
insert(env, 2, "Bob")
insert(env, 3, "Peter")
display(env)

print("Delete the record where sid = 1.")
delete(env, 1)
display(env)

print("Update the record where sid = 3.")
update(env, 3, "Mark")
display(env)

print("Get the name of student whose sid = 3.")
name = search(env, 3)
print(name)

#------------------------------------
"""
Insert 3 records.
1 Alice
2 Bob
3 Peter
Delete the record where sid = 1.
2 Bob
3 Peter
Update the record where sid = 3.
2 Bob
3 Mark
Get the name of student whose sid = 3.
Mark
"""

3. PyYaml pip install pyyaml

pytorch无报错中断 pytorch checkpoint_pytorch无报错中断

# -*- coding:utf-8 -*-
import yaml

#https://www.zhihu.com/column/c_1244271799468482560
f = open(r'dataconfig.yml')
dic = yaml.load(f, Loader=yaml.FullLoader)
print (type(dic), dic)  # type(dic)  <class 'dict'>


dic2 = {'name': 'Tom Smith', 'age': 37, 'spouse': {'name': 'Jane Smith', 'age': 25},
        'children': [{'name': 'Jimmy Smith', 'age': 15},
                     {'name1': 'Jenny Smith', 'age1': 12}]}
f_s = open(r'dataconfig2.yml','w')
yaml.dump(dic2, f_s)

4. Pytorch中的Conv1d()和Conv2d()函数

pytorch无报错中断 pytorch checkpoint_pytorch无报错中断_02

pytorch无报错中断 pytorch checkpoint_pytorch模型加载参数_03

pytorch无报错中断 pytorch checkpoint_pytorch模型加载参数_04

一般来说，一维卷积nn.Conv1d()用于文本数据，只对宽度进行卷积，对高度不卷积。通常，输入大小为word_embedding_dim * max_sent_length，其中，word_embedding_dim为词向量的维度，max_sent_length为句子的最大长度。卷积核窗口在句子长度的方向上滑动，进行卷积操作。

输入：批大小为50，句子的最大长度为35，词向量维度为300

目标：句子分类，共2类

import torch
import torch.nn as nn

# max_sent_len=35, batch_size=50, embedding_size=300
conv1 = nn.Conv1d(in_channels=300, out_channels=100, kernel_size=3)
input = torch.randn(50, 35, 300)
# batch_size x max_sent_len x embedding_size -> batch_size x embedding_size x max_sent_len
input = input.permute(0, 2, 1)
print("input:", input.size())
output = conv1(input)
print("output:", output.size())
# 最大池化
pool1d = nn.MaxPool1d(kernel_size=35-3+1)
pool1d_value = pool1d(output)
print("最大池化输出：", pool1d_value.size())
# 全连接
fc = nn.Linear(in_features=100, out_features=2)
fc_inp = pool1d_value.view(-1, pool1d_value.size(1))
print("全连接输入：", fc_inp.size())
fc_outp = fc(fc_inp)
print("全连接输出：", fc_outp.size())
# softmax
m = nn.Softmax()
out = m(fc_outp)
print("输出结果值：", out)

原始输入大小为(50, 35, 300)，经过permute(0, 2, 1)操作后，输入的大小变为(50, 300, 35)；
使用1个window_size为3的卷积核进行卷积，因为一维卷积是在最后维度上扫的，最后output的大小即为：50*100*（35-3+1）=50*100*33
output经过最大池化操作后，得到了数据维度为：(50,100,1)
经过（输入特征=100，输出特征=2）的全连接层，数据维度就变为了：(50，2)
再经过softmax函数就得到了属于两个类别的概率值

eg: 假设 embedding_size=256, feature_size=100, window_sizes=[3,4,5,6], max_text_len=35

class TextCNN(nn.Module):
    def __init__(self, config):
        super(TextCNN, self).__init__()
        self.is_training = True
        self.dropout_rate = config.dropout_rate
        self.num_class = config.num_class
        self.use_element = config.use_element
        self.config = config
 
        self.embedding = nn.Embedding(num_embeddings=config.vocab_size, 
                                embedding_dim=config.embedding_size)
        self.convs = nn.ModuleList([
                nn.Sequential(nn.Conv1d(in_channels=config.embedding_size, 
                                        out_channels=config.feature_size, 
                                        kernel_size=h),
#                              nn.BatchNorm1d(num_features=config.feature_size), 
                              nn.ReLU(),
                              nn.MaxPool1d(kernel_size=config.max_text_len-h+1))
                     for h in config.window_sizes ])

        self.fc = nn.Linear(in_features=config.feature_size*len(config.window_sizes),
                            out_features=config.num_class)
        if os.path.exists(config.embedding_path) and config.is_training and config.is_pretrain:
            print("Loading pretrain embedding...")
            self.embedding.weight.data.copy_(torch.from_numpy(np.load(config.embedding_path)))    
    
    def forward(self, x):
        embed_x = self.embedding(x)
        
        #print('embed size 1',embed_x.size())  # 32*35*256
# batch_size x text_len x embedding_size  -> batch_size x embedding_size x text_len
        embed_x = embed_x.permute(0, 2, 1)
        #print('embed size 2',embed_x.size())  # 32*256*35
        out = [conv(embed_x) for conv in self.convs]  #out[i]:batch_size x feature_size*1
        #for o in out:
        #    print('o',o.size())  # 32*100*1
        out = torch.cat(out, dim=1)  # 对应第二个维度（行）拼接起来，比如说5*2*1,5*3*1的拼接变成5*5*1
        #print(out.size(1)) # 32*400*1
        out = out.view(-1, out.size(1)) 
        #print(out.size())  # 32*400 
        if not self.use_element:
            out = F.dropout(input=out, p=self.dropout_rate)
            out = self.fc(out)
        return out

基于上述代码，具体计算过程如下：