# PyTorch: Converting Dates with Attention and Visualizing the Attention Weights
Environment: Python 3.6, PyTorch 1.0

import json
from matplotlib import ticker
from numpy import *
from collections import Counter
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda')

Data preprocessing:
First count the characters at the character level, then build a dictionary mapping each character to an index, and finally build the reverse dictionary mapping indices back to characters.

def build_vocab(texts,n=None):
    counter = Counter(''.join(texts))     # character-level counts; Counter is a simple counter, e.g. it tallies how often each character appears
    char2index = {w:i for i,(w,c) in enumerate(counter.most_common(n),start=4)}    # (w,c) is (character, count); most_common() returns elements ordered from most to least frequent
    char2index['~'] = 0  # pad: sequences shorter than the batch maximum are padded with 0; four special symbols are added, which is why enumerate starts at 4
    char2index['^'] = 1  # sos: start of sequence
    char2index['$'] = 2  # eos: end of sequence
    char2index['#'] = 3  # unk: characters not found in the vocabulary
    index2char = {i:w for w,i in char2index.items()}
    return char2index,index2char
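
As a quick sanity check (the one-item input list below is made up purely for illustration), this is what build_vocab returns:

toy_c2ix, toy_ix2c = build_vocab(['aab'])
print(toy_c2ix)  # {'a': 4, 'b': 5, '~': 0, '^': 1, '$': 2, '#': 3}
print(toy_ix2c)  # {4: 'a', 5: 'b', 0: '~', 1: '^', 2: '$', 3: '#'}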

Data download link: https://pan.baidu.com/s/132uS7mMzn7ISqEVg8i27eA  extraction code: 36fu

pairs = json.load(open('./data/Time Dataset.json','rt',encoding='utf-8'))
print(pairs[:2])   # take a look at the data format
[['six hours and fifty five am', '06:55'], ['48 min before 10 a.m', '09:12']]

Separate the source texts from the target texts and build a vocabulary for each:

data = array(pairs)
src_texts = data[:,0]  # all values in the first column (source texts)
trg_texts = data[:,1]  # all values in the second column (target texts)
src_c2ix,src_ix2c = build_vocab(src_texts)
trg_c2ix,trg_ix2c = build_vocab(trg_texts)

Next we update the model in batches, so we define a function that generates a random batch. It converts the texts into their index representations and returns batch_size samples together with their lengths, sorted by length in descending order; padding is done up to the longest sequence in the batch. This is mainly to fit pack_padded_sequence: pad tokens do not need to be fed through the RNN, which only has to iterate over each sequence's true length.

def indexes_from_text(text,char2index):
    return [1] + [char2index.get(c,3) for c in text] + [2]   # wrap with sos (1) and eos (2); unseen characters map to unk (3)
def pad_seq(seq,max_length):
    seq += [0 for _ in range(max_length - len(seq))]
    return seq
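
A tiny worked example of the two helpers (demo_c2ix is a hand-made vocabulary, not the real one built above):

demo_c2ix = {'~': 0, '^': 1, '$': 2, '#': 3, '0': 4, '6': 5, ':': 6, '5': 7}
seq = indexes_from_text('06:55', demo_c2ix)
print(seq)                # [1, 4, 5, 6, 7, 7, 2] -- wrapped in sos/eos
print(pad_seq(seq, 10))   # [1, 4, 5, 6, 7, 7, 2, 0, 0, 0] -- padded with 0 up to length 10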

# map(function, iterable, ...) applies function to every element of the iterable and returns the results;
# the +2 accounts for the sos and eos tokens added by indexes_from_text
max_src_len = max(list(map(len,src_texts)))+2
max_trg_len = max(list(map(len,trg_texts)))+2
max_src_len,max_trg_len
(43, 7)
def random_batch(batch_size,pairs,src_c2ix,trg_c2ix):
    input_seqs,target_seqs = [],[]
    for i in np.random.choice(len(pairs),batch_size):   # sample batch_size random indices
        input_seqs.append(indexes_from_text(pairs[i][0],src_c2ix))  # convert the sampled source text to indices
        target_seqs.append(indexes_from_text(pairs[i][1],trg_c2ix))

    seq_pairs = sorted(zip(input_seqs,target_seqs),key=lambda p:len(p[0]),reverse=True)  # sort by key, here the length of the input sequence, in descending order
    input_seqs,target_seqs = zip(*seq_pairs)  # zip(*...) is the inverse of zip: it unpacks the pairs back into two tuples
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s,max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s,max(target_lengths)) for s in target_seqs]
    
    input_var = torch.LongTensor(input_padded).transpose(0,1)  # torch.transpose(input, dim0, dim1) returns a tensor with dimensions dim0 and dim1 swapped
    # the resulting shape is seq_len * batch_size
    target_var = torch.LongTensor(target_padded).transpose(0,1)
    input_var = input_var.to(device)
    target_var = target_var.to(device)
    
    return input_var,input_lengths,target_var,target_lengths

“”"
sort 与 sorted 区别:
sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
list 的 sort 方法返回的是对已经存在的列表进行操作,无返回值,而内建函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作
sorted(iterable[, cmp[, key[, reverse]]])
key – 主要是用来进行比较的元素,只有一个参数,具体的函数的参数就是取自于可迭代对象中,指定可迭代对象中的一个元素来进行排序。
reverse – 排序规则,reverse = True 降序 , reverse = False 升序(默认)。
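
A minimal sketch of the sort-and-unzip step used in random_batch above (the toy index lists are made up):

inp = [[1, 5, 2], [1, 7, 8, 9, 2], [1, 3, 2, 6]]
tgt = [[1, 4, 2], [1, 6, 2], [1, 5, 2]]
pairs_sorted = sorted(zip(inp, tgt), key=lambda p: len(p[0]), reverse=True)
inp, tgt = zip(*pairs_sorted)     # unzip back into two tuples
print([len(s) for s in inp])      # [5, 4, 3] -- descending source lengths
print(tgt)                        # ([1, 6, 2], [1, 5, 2], [1, 4, 2])
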
“”"测试batch_size = 3时是否能够正确输出

random_batch(3,data,src_c2ix,trg_c2ix)
(tensor([[ 1,  1,  1],
         [ 6, 23,  6],
         [ 5,  9, 18],
         [ 8, 23, 23],
         [ 4, 37,  9],
         [ 7,  4, 26],
         [33, 13, 23],
         [22,  9,  2],
         [30, 11,  0],
         [ 7,  9,  0],
         [22,  2,  0],
         [34,  0,  0],
         [ 4,  0,  0],
         [ 6,  0,  0],
         [31,  0,  0],
         [ 5,  0,  0],
         [ 8,  0,  0],
         [ 6,  0,  0],
         [20,  0,  0],
         [ 4,  0,  0],
         [13,  0,  0],
         [ 9,  0,  0],
         [11,  0,  0],
         [ 9,  0,  0],
         [ 2,  0,  0]], device='cuda:0'), [25, 11, 8], tensor([[ 1,  1,  1],
         [ 6,  5,  7],
         [ 5,  8,  8],
         [ 4,  4,  4],
         [ 7,  8,  5],
         [ 5, 12,  8],
         [ 2,  2,  2]], device='cuda:0'), [7, 7, 7])

Model:
The model consists of two parts, an encoder and a decoder. The encoder is simple: an embedding layer followed by a two-layer GRU. The batch formatting above was mainly done so that pack_padded_sequence and pad_packed_sequence can handle the batched GRU inputs and outputs; see
https://blog.csdn.net/lssc4205/article/details/79474735
https://blog.csdn.net/u012436149/article/details/79749409
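
Before the Encoder itself, here is a minimal, standalone sketch of the pack/pad round trip (the sizes are arbitrary and only for illustration):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn = torch.nn.GRU(input_size=4, hidden_size=6)
x = torch.randn(7, 3, 4)             # (seq_len, batch, feature), already sorted by length
lengths = [7, 5, 2]                  # true lengths, descending
packed = pack_padded_sequence(x, lengths)
out_packed, h = rnn(packed)
out, out_lengths = pad_packed_sequence(out_packed)
print(out.shape, out_lengths.tolist())   # torch.Size([7, 3, 6]) [7, 5, 2]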

class Encoder(nn.Module):
    def __init__(self,input_dim,embedding_dim,hidden_dim,num_layers=2,dropout=0.2):
        super().__init__()
        
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        #input_dim = vocab_size + 1
        self.embedding = nn.Embedding(input_dim,embedding_dim)
        
        self.rnn = nn.GRU(embedding_dim,hidden_dim,num_layers=num_layers,dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self,input_seqs,input_lengths,hidden=None):
        #src = [sent_len,batch_size]
        embedded = self.dropout(self.embedding(input_seqs))
        
        #embedded = [sent_len,batch_size,emb_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded,input_lengths)
        
        outputs,hidden = self.rnn(packed,hidden)
        outputs,output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        
        return outputs,hidden

For reference, the plain (unpacked) call would be outputs, hidden = self.rnn(embedded, hidden), with outputs = [sent_len, batch_size, hid_dim * n_directions] and hidden = [n_layers, batch_size, hid_dim]; outputs always come from the last layer.
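
A quick shape check of the encoder on a small random batch (a sketch; the 100-dimensional embedding/hidden sizes match the training configuration used later in the post):

enc = Encoder(input_dim=len(src_c2ix) + 1, embedding_dim=100, hidden_dim=100).to(device)
src_seqs, src_lengths, _, _ = random_batch(4, pairs, src_c2ix, trg_c2ix)
enc_outputs, enc_hidden = enc(src_seqs, src_lengths)
print(enc_outputs.shape)  # (longest_src_len_in_batch, 4, 100)
print(enc_hidden.shape)   # (2, 4, 100): num_layers, batch_size, hidden_dim
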
First, define the Attention layer. Here attention is applied to the encoder outputs; it could also be applied directly to the embedding-layer outputs.
The attention computation is defined in the paper Neural Machine Translation by Jointly Learning to Align and Translate.

The decoder's output at step $i$ depends on the decoder's previous outputs and the context. The context here consists of the current GRU hidden state $s_i$ (which already accounts for the previous outputs) and the attention context vector $c_i$ computed from the encoder outputs. The prediction is made by a fully connected layer $g$ with a non-linear activation, whose inputs are $y_{i-1}$, $s_i$ and $c_i$:

$$p(y_i \mid y_1, \dots, y_{i-1}, \mathbf{x}) = g(y_{i-1}, s_i, c_i)$$

The context vector is a weighted sum of all encoder outputs, where $\alpha_{ij}$ is the weight that the $i$-th output word assigns to the $j$-th encoder output $h_j$:

$$c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j$$

Each $\alpha_{ij}$ is obtained by applying a softmax over the scores $e_{ij}$, and each $e_{ij}$ is a score computed from the decoder's previous hidden state $s_{i-1}$ and the corresponding encoder output $h_j$ through some alignment function $a$:

$$\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{T_x} \exp(e_{ik})}, \qquad e_{ij} = a(s_{i-1}, h_j)$$

In addition, the paper Effective Approaches to Attention-based Neural Machine Translation proposes several ways to compute the score; the third one (concat) is used here:

$$\mathrm{score}(h_t, \bar{h}_s) =
\begin{cases}
h_t^\top \bar{h}_s & \text{dot} \\
h_t^\top W_a \bar{h}_s & \text{general} \\
v_a^\top \tanh\!\left(W_a [h_t; \bar{h}_s]\right) & \text{concat}
\end{cases}$$

class Attention(nn.Module):
    def __init__(self,hidden_dim):
        super(Attention,self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(self.hidden_dim*2,hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))
        self.v.data.normal_(mean=0,std=1./np.sqrt(self.v.size(0)))
        
    def forward(self,hidden,encoder_outputs):
        #encoder_outputs:(seq_len,batch_size,hidden_size)
        #hidden:(num_layers*num_directions,batch_size,hidden_size)
        max_len = encoder_outputs.size(0)
        
        h = hidden[-1].repeat(max_len,1,1)  # repeat the last layer's hidden state max_len times along the time axis
        #(seq_len,batch_size,hidden_size)
        
        attn_energies = self.score(h,encoder_outputs)   # compute the attention scores
        return F.softmax(attn_energies,dim=1)  # normalize the scores with softmax
    
    def score(self,hidden,encoder_outputs):
        # (seq_len,batch_size,2*hidden_size) -> (seq_len,batch_size,hidden_size)
        energy = torch.tanh(self.attn(torch.cat([hidden,encoder_outputs],2)))
        energy = energy.permute(1,2,0)  # (batch_size,hidden_size,seq_len): permute moves dim 0 to position 2, dim 1 to 0, dim 2 to 1
        v = self.v.repeat(encoder_outputs.size(1),1).unsqueeze(1)  # (batch_size,1,hidden_size)
        energy = torch.bmm(v,energy)  # (batch_size,1,seq_len): torch.bmm performs a batched matrix multiplication
        return energy.squeeze(1)   # (batch_size,seq_len)
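
A quick sanity check on the Attention module with dummy tensors (a sketch; the sizes are arbitrary): the returned weights have shape (batch_size, seq_len) and each row sums to 1.

attn = Attention(hidden_dim=100).to(device)
enc_out = torch.randn(15, 4, 100).to(device)    # (seq_len, batch_size, hidden_size)
dec_hidden = torch.randn(2, 4, 100).to(device)  # (num_layers, batch_size, hidden_size)
w = attn(dec_hidden, enc_out)
print(w.shape)       # torch.Size([4, 15])
print(w.sum(dim=1))  # each of the 4 rows sums to ~1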

Next comes the decoder with the attention layer. The GRU output goes through a fully connected layer and then a log_softmax, which gives log-probabilities over the output vocabulary. This is mainly for convenience with the NLLLoss loss function; with CrossEntropyLoss the softmax would not be needed. NLLLoss expects a vector of log-probabilities and a target label; it does not compute the log-probabilities itself, so it suits a network whose last layer is log_softmax. CrossEntropyLoss is similar to NLLLoss, the only difference being that it applies the (log-)softmax for us. In short: CrossEntropyLoss() = log_softmax() + NLLLoss().
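
A quick numerical check of that equivalence (not part of the original post):

logits = torch.randn(3, 5)                 # (batch, vocab) raw scores
target = torch.tensor([1, 0, 4])
ce = nn.CrossEntropyLoss()(logits, target)
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), target)
print(torch.allclose(ce, nll))             # True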

class Decoder(nn.Module):
    def __init__(self,output_dim,embedding_dim,hidden_dim,num_layers=2,dropout=0.2):
        super().__init__()
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim,embedding_dim)
        self.attention = Attention(hidden_dim)
        self.rnn = nn.GRU(embedding_dim+hidden_dim,hidden_dim,num_layers=num_layers,dropout=dropout)
        self.out = nn.Linear(embedding_dim+hidden_dim*2,output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self,input,hidden,encoder_outputs):
        #input = [bsz]
        #hidden = [n_layer*n_direction,batch_size,hid_dim]
        #encoder_outputs = [sent_len,batch_size,hid_dim*n_direction]
        input = input.unsqueeze(0)
        #input = [1,bsz]
        embedded = self.dropout(self.embedding(input))
        #embedded = [1,bsz,emb_dim]
        attn_weight = self.attention(hidden,encoder_outputs)
        #(batch_size,seq_len)
        context = attn_weight.unsqueeze(1).bmm(encoder_outputs.transpose(0,1)).transpose(0,1)
        #(batch_size,1,hidden_dim*n_directions)
        #(1,batch_size,hidden_dim*n_directions)
        emb_con = torch.cat((embedded,context),dim=2)
        #emb_con = [1,bsz,emb_dim+hid_dim]
        _,hidden = self.rnn(emb_con,hidden)
        #outputs = [sent_len,batch_size,hid_dim*n_directions]
        #hidden = [n_layers*n_direction,batch_size,hid_dim]
        output = torch.cat((embedded.squeeze(0),hidden[-1],context.squeeze(0)),dim=1)
        output = F.log_softmax(self.out(output),1)
        #output = [batch_size,output_dim] -- log-probabilities over the target vocabulary
        return output,hidden,attn_weight
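
A single decoding step with dummy encoder states, to check the shapes the Decoder returns (a sketch; sizes are arbitrary):

dec = Decoder(output_dim=len(trg_c2ix) + 1, embedding_dim=100, hidden_dim=100).to(device)
enc_out = torch.randn(15, 4, 100).to(device)    # fake encoder outputs: (seq_len, batch, hidden)
hid = torch.randn(2, 4, 100).to(device)         # fake decoder hidden state: (num_layers, batch, hidden)
sos = torch.LongTensor([1] * 4).to(device)      # a batch of <sos> tokens
out, hid, attn_w = dec(sos, hid, enc_out)
print(out.shape)     # (4, len(trg_c2ix) + 1): log-probabilities over the target vocabulary
print(attn_w.shape)  # (4, 15): one weight per encoder position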

We now define a Seq2Seq class that combines the encoder and decoder. In a loop over time steps, the model generates the output sequence for each batch from left to right. During training, teacher forcing randomly feeds either the ground-truth token or the model's own predicted token as the next input; at test time this is not needed.

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,device,teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio
    
    def forward(self,src_seqs,src_lengths,trg_seqs):
        #src_seqs = [sent_len,batch_size]
        #trg_seqs = [sent_len,batch_size]
        batch_size = src_seqs.shape[1]
        max_len = trg_seqs.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store the decoder outputs
        outputs = torch.zeros(max_len,batch_size,trg_vocab_size).to(self.device)
        # hidden is used as the decoder's initial hidden state
        # encoder_outputs are used to compute the context vectors
        encoder_outputs,hidden = self.encoder(src_seqs,src_lengths)
        # the first input to the decoder is <sos>
        output = trg_seqs[0,:]
        for t in range(1,max_len):  # start at 1 to skip <sos>
            output,hidden,_ = self.decoder(output,hidden,encoder_outputs)
            outputs[t] = output
            teacher_force = np.random.random() < self.teacher_forcing_ratio
            output = (trg_seqs[t] if teacher_force else output.max(1)[1])
        return outputs
    
    def predict(self,src_seqs,src_lengths,max_trg_len=20,start_ix=1):
        max_src_len = src_seqs.shape[0]
        batch_size = src_seqs.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(max_trg_len,batch_size,trg_vocab_size).to(self.device)
        encoder_outputs,hidden = self.encoder(src_seqs,src_lengths)
        output = torch.LongTensor([start_ix]*batch_size).to(self.device)    # initialize the first decoder input (<sos>) for the whole batch
        attn_weights = torch.zeros((max_trg_len,batch_size,max_src_len))
        for t in range(1,max_trg_len):
            output,hidden,attn_weight = self.decoder(output,hidden,encoder_outputs)
            outputs[t] = output
            output = output.max(1)[1]
            attn_weights[t] = attn_weight
        return outputs,attn_weights
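
As a smoke test (a sketch, assuming the classes above are defined; the untrained model will of course produce garbage), the whole pipeline can be run once to verify output shapes:

enc = Encoder(len(src_c2ix) + 1, 100, 100)
dec = Decoder(len(trg_c2ix) + 1, 100, 100)
smoke_model = Seq2Seq(enc, dec, device).to(device)
src_seqs, src_lengths, trg_seqs, _ = random_batch(4, pairs, src_c2ix, trg_c2ix)
out = smoke_model(src_seqs, src_lengths, trg_seqs)
print(out.shape)  # (trg_len_in_batch, 4, len(trg_c2ix) + 1)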

Model training:
Train directly with 1000 random batches.

import torch.optim as optim

embedding_dim = 100
hidden_dim = 100
batch_size = 256
clip = 5

encoder = Encoder(len(src_c2ix)+1,embedding_dim,hidden_dim)
decoder = Decoder(len(trg_c2ix)+1,embedding_dim,hidden_dim)
model = Seq2Seq(encoder,decoder,device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.NLLLoss(ignore_index=0).to(device)

model.train()

for batch_id in range(1,1001):
    src_seqs,src_lengths,trg_seqs,_ = random_batch(batch_size,pairs,src_c2ix,trg_c2ix)
    
    optimizer.zero_grad()
    output = model(src_seqs,src_lengths,trg_seqs)
    loss = criterion(output.view(-1,output.shape[2]),trg_seqs.view(-1))
    loss.backward()
    # gradient clipping: backprop through an RNN can make gradients explode, so set a threshold and rescale the gradient norm down to it whenever it is exceeded
    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
    optimizer.step()
    
    if batch_id % 100 == 0:
        print('current loss:{:.4f}'.format(loss.item()))

torch.save(model, 'model.pth')
current loss:0.8211
current loss:0.3182
current loss:0.2070
current loss:0.1032
current loss:0.0706
current loss:0.0345
current loss:0.0343
current loss:0.0215
current loss:0.0108
current loss:0.0169


c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Seq2Seq. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Encoder. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Decoder. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Attention. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "

Testing:
The main goal is to visualize the attention weights.

def show_attention(input_words,output_words,attentions):
    plt.rcParams['savefig.dpi'] = 300  # saved-figure resolution
    plt.rcParams['figure.dpi'] = 300   # display resolution
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions,cmap='bone') # visualize the attention matrix
    fig.colorbar(cax)
    
    # set up the tick labels
    ax.set_xticklabels(['']+input_words)
    ax.set_yticklabels(['']+output_words)
    
    # show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator())
    ax.yaxis.set_major_locator(ticker.MultipleLocator())
    
    plt.show()
    plt.close()

def evaluate(model,text,src_c2ix,trg_ix2c):
    model.eval()
    with torch.no_grad():
        seq = torch.LongTensor(indexes_from_text(text,src_c2ix)).view(-1,1).to(device)
        outputs,attn_weights = model.predict(seq,[seq.size(0)],max_trg_len)
        outputs = outputs.squeeze(1).cpu().numpy()
        attn_weights = attn_weights.squeeze(1).cpu().numpy()
        output_words = [trg_ix2c[np.argmax(word_prob)] for word_prob in outputs]
        show_attention(list('^'+text+'$'),output_words,attn_weights)
text = 'thirsty 1 before 3 clock affternoon'
evaluate(model,text,src_c2ix,trg_ix2c)

[Figure: attention heatmap for the input 'thirsty 1 before 3 clock affternoon' and the predicted output]

text = 'forty seven min before 10 p.m'
evaluate(model,text,src_c2ix,trg_ix2c)

[Figure: attention heatmap for the input 'forty seven min before 10 p.m' and the predicted output]