Next, we describe how to build the encoder and the decoder neural networks in turn.

Encoder Network

Our encoder network is a two-layer RNN built with bidirectional GRU units. The code is as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Build the encoder RNN
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        # First layer: embedding
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Second layer: GRU. Note that a single nn.GRU can contain several
        # stacked layers, controlled mainly by num_layers
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first = True,
                          num_layers = self.n_layers, bidirectional = True)

    def forward(self, input, hidden):
        # Forward pass
        # input size: batch_size, length_seq
        embedded = self.embedding(input)
        # embedded size: batch_size, length_seq, hidden_size
        output = embedded
        output, hidden = self.gru(output, hidden)
        # output size: batch_size, length_seq, hidden_size * directions
        # hidden size: num_layers * directions, batch_size, hidden_size
        return output, hidden

    def initHidden(self, batch_size):
        # Initialize all hidden-state variables to zero
        # num_layers * num_directions, batch, hidden_size
        result = torch.zeros(self.n_layers * 2, batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result

In this code we define the structure of the encoder network, whose recurrent units form a bidirectional GRU. Simply setting bidirectional = True is enough to obtain a bidirectional GRU; every other operation is no different from an ordinary RNN.
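
Before moving on to the decoder, a quick shape check helps confirm what the bidirectional encoder returns. The sketch below is purely illustrative: the vocabulary size, hidden size, batch size, and sequence length are made-up values, and use_cuda is the GPU flag used throughout the chapter.

# A quick shape check for the encoder (illustrative values only)
input_size, hidden_size = 1000, 32   # hypothetical vocabulary size and hidden dimension
batch_size, seq_length = 4, 7

encoder = EncoderRNN(input_size, hidden_size, n_layers=1)
# A batch of random word indices, size: batch_size, length_seq
word_ids = torch.randint(0, input_size, (batch_size, seq_length))
if use_cuda:
    encoder, word_ids = encoder.cuda(), word_ids.cuda()

encoder_hidden = encoder.initHidden(batch_size)
encoder_outputs, encoder_hidden = encoder(word_ids, encoder_hidden)

print(encoder_outputs.size())  # torch.Size([4, 7, 64]): hidden_size * 2 because the GRU is bidirectional
print(encoder_hidden.size())   # torch.Size([2, 4, 32]): n_layers * 2 directions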

Attention Decoder Network

Next, we implement a decoder network equipped with an attention mechanism. The code is as follows:

# Define the decoder RNN with an attention mechanism
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Word-embedding layer
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        # Attention network (a feed-forward neural network)
        self.attn = nn.Linear(self.hidden_size * (2 * n_layers + 1), self.max_length)

        # Maps the result of the attention step into the layers that follow
        self.attn_combine = nn.Linear(self.hidden_size * 3, self.hidden_size)

        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_p)

        # Define a bidirectional GRU, with batch_first = True for convenience
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, bidirectional = True,
                          num_layers = self.n_layers, batch_first = True)
        self.out = nn.Linear(self.hidden_size * 2, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        # One decoding step
        # input size: batch_size, length_seq
        embedded = self.embedding(input)
        # embedded size: batch_size, length_seq, hidden_size
        embedded = embedded[:, 0, :]
        # embedded size: batch_size, hidden_size
        embedded = self.dropout(embedded)

        # Reshape the hidden tensor so that batch_size sits in dimension 0
        # hidden size: directions * n_layers, batch_size, hidden_size
        temp_for_transpose = torch.transpose(hidden, 0, 1).contiguous()
        temp_for_transpose = temp_for_transpose.view(temp_for_transpose.size()[0], -1)
        hidden_attn = temp_for_transpose

        # Input to the attention layer
        # hidden_attn size: batch_size, directions * n_layers * hidden_size
        input_to_attention = torch.cat((embedded, hidden_attn), 1)
        # input_to_attention size: batch_size, hidden_size * (1 + directions * n_layers)

        # Attention weights produced by the attention layer
        attn_weights = F.softmax(self.attn(input_to_attention), dim = 1)
        # attn_weights size: batch_size, max_length

        # When the input sequence is shorter than max_length, keep only the needed part of the weights
        attn_weights = attn_weights[:, : encoder_outputs.size()[1]]
        # attn_weights size: batch_size, length_seq_of_encoder
        attn_weights = attn_weights.unsqueeze(1)
        # attn_weights size: batch_size, 1, length_seq; the middle 1 is there for the bmm multiplication

        # Multiply the attention weights by encoder_outputs to obtain the attention-weighted context
        # encoder_outputs size: batch_size, seq_length, hidden_size * directions
        attn_applied = torch.bmm(attn_weights, encoder_outputs)
        # attn_applied size: batch_size, 1, hidden_size * directions
        # bmm: batched matrix multiplication; the batch dimension is kept and the time dimension is contracted

        # Concatenate the input word vector with the attention result into one large input vector
        output = torch.cat((embedded, attn_applied[:, 0, :]), 1)
        # output size: batch_size, hidden_size * (directions + 1)

        # Map the large input vector into the GRU's hidden dimension
        output = self.attn_combine(output).unsqueeze(1)
        # output size: batch_size, 1, hidden_size
        output = F.relu(output)

        # Apply dropout to the result
        output = self.dropout(output)

        # Run the decoder GRU
        output, hidden = self.gru(output, hidden)

        # output size: batch_size, length_seq, hidden_size * directions
        # hidden size: n_layers * directions, batch_size, hidden_size

        # Take the result of the last GRU step and feed it into the final fully connected layer
        output = self.out(output[:, -1, :])
        # output size: batch_size, output_size

        # Apply log softmax to compute the final output
        output = F.log_softmax(output, dim = 1)
        # output size: batch_size, output_size
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        # Initialize the decoder hidden state, size: n_layers * directions, batch_size, hidden_size
        result = torch.zeros(self.n_layers * 2, batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result

This code is fairly involved and deserves a careful read, especially where tensor shapes are concerned; the comments point out the sizes of the input and output tensors of every operation. In addition, this code contains one more neural network layer than the version in the main text: the attn_combine layer. It takes the attention-weighted readout of the encoder hidden units (the inner product of the attention weights with the encoder outputs), maps it through a fully connected network, and only then feeds it into the decoder's recurrent units; this treatment slightly improves the network's performance.
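
To see how the two networks fit together, the sketch below runs a single decoding step on top of the encoder sketch shown earlier. It is only illustrative: the target vocabulary size and the start-token index are made-up values, and MAX_LENGTH is the constant defined earlier in the chapter (assumed to be at least as large as the encoder's sequence length).

# One decoding step with attention, reusing hidden_size, batch_size and
# encoder_outputs from the encoder sketch above (illustrative values only)
output_size = 1500                     # hypothetical target vocabulary size
decoder = AttnDecoderRNN(hidden_size, output_size, n_layers=1,
                         dropout_p=0.1, max_length=MAX_LENGTH)
# A batch of single start tokens; index 0 is a placeholder for the start symbol
decoder_input = torch.zeros(batch_size, 1, dtype=torch.long)
if use_cuda:
    decoder, decoder_input = decoder.cuda(), decoder_input.cuda()

# Zero-initialize the decoder hidden state; it has the same shape as the
# encoder's final hidden state, which could be carried over here instead
decoder_hidden = decoder.initHidden(batch_size)
decoder_output, decoder_hidden, attn_weights = decoder(
    decoder_input, decoder_hidden, encoder_outputs)

print(decoder_output.size())  # torch.Size([4, 1500]): log-probabilities over the target vocabulary
print(attn_weights.size())    # torch.Size([4, 1, 7]): one weight per encoder time step

In the full translation model this step is executed once per output word, with the returned hidden state fed back into the next call.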