How it is called:

with tf.variable_scope('t_attend_r'):
    t_a_r = layers.block(t, r, r, Q_lengths=t_len, K_lengths=r_len)

with tf.variable_scope('r_attend_t'):
    r_a_t = layers.block(r, t, t, Q_lengths=r_len, K_lengths=t_len)
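Here t and r are presumably the embedded context turn and response candidate, and t_len, r_len their true (unpadded) lengths. A minimal sketch of how these tensors might be set up, assuming TensorFlow 1.x placeholders and illustrative sizes (the concrete dimensions are assumptions, not taken from the original code):

import tensorflow as tf

# illustrative sizes (assumptions)
batch_size, max_turn_len, max_response_len, emb_dim = 32, 50, 50, 200

# embedded context turn and response candidate
t = tf.placeholder(tf.float32, [batch_size, max_turn_len, emb_dim], name='turn')
r = tf.placeholder(tf.float32, [batch_size, max_response_len, emb_dim], name='response')

# actual (unpadded) length of each sequence in the batch
t_len = tf.placeholder(tf.int32, [batch_size], name='turn_len')
r_len = tf.placeholder(tf.int32, [batch_size], name='response_len')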

layers.block is defined as follows:

def block(
        Q, K, V,
        Q_lengths, K_lengths,
        attention_type='dot',
        is_layer_norm=True,
        is_mask=True, mask_value=-2**32+1,
        drop_prob=None):
    '''Add a block unit from https://arxiv.org/pdf/1706.03762.pdf.
    Args:
        Q: [batch, Q_time, Q_dimension]
        K: [batch, K_time, K_dimension]
        V: [batch, K_time, V_dimension]

        Q_lengths: [batch]
        K_lengths: [batch]

    Returns:
        a tensor with shape [batch, Q_time, Q_dimension]
    '''
    att = attention(Q, K, V,
                    Q_lengths, K_lengths,
                    attention_type=attention_type,
                    is_mask=is_mask, mask_value=mask_value,
                    drop_prob=drop_prob)
    # residual connection + layer norm (the "Add & Norm" step)
    if is_layer_norm:
        with tf.variable_scope('attention_layer_norm'):
            y = op.layer_norm_debug(Q + att)
    else:
        y = Q + att

    # position-wise feed-forward network, again followed by Add & Norm
    z = FFN(y)
    if is_layer_norm:
        with tf.variable_scope('FFN_layer_norm'):
            w = op.layer_norm_debug(y + z)
    else:
        w = y + z
    return w
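op.layer_norm_debug is not shown in this section. Below is a plausible sketch of what it does, a standard layer normalization over the last dimension with learned scale and bias; the body is an assumption, only the name comes from the calls above:

import tensorflow as tf

def layer_norm_debug(x, epsilon=1e-6):
    '''Layer normalization over the last axis (sketch, not the original helper).'''
    dim = x.shape[-1]
    scale = tf.get_variable('scale', shape=[dim],
                            initializer=tf.ones_initializer())
    bias = tf.get_variable('bias', shape=[dim],
                           initializer=tf.zeros_initializer())
    # per-position mean/variance over the feature dimension
    mean, variance = tf.nn.moments(x, axes=[-1], keep_dims=True)
    normalized = (x - mean) * tf.rsqrt(variance + epsilon)
    return normalized * scale + bias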


def attention(
        Q, K, V,
        Q_lengths, K_lengths,
        attention_type='dot',
        is_mask=True, mask_value=-2**32+1,
        drop_prob=None):
    '''Add attention layer.
    Args:
        Q: [batch, Q_time, Q_dimension]
        K: [batch, K_time, K_dimension]
        V: [batch, K_time, V_dimension]

        Q_lengths: [batch]
        K_lengths: [batch]

    Returns:
        a tensor with shape [batch, Q_time, V_dimension]
    '''
    assert attention_type in ('dot', 'bilinear')
    if attention_type == 'dot':
        assert Q.shape[-1] == K.shape[-1]

    Q_time = Q.shape[1]
    K_time = K.shape[1]

    if attention_type == 'dot':
        logits = op.dot_sim(Q, K)  # [batch, Q_time, K_time]
    elif attention_type == 'bilinear':
        logits = op.bilinear_sim(Q, K)

    if is_mask:
        # push logits at padded positions to a large negative value
        mask = op.mask(Q_lengths, K_lengths, Q_time, K_time)  # [batch, Q_time, K_time]
        logits = mask * logits + (1 - mask) * mask_value

    attention = tf.nn.softmax(logits)

    if drop_prob is not None:
        print('use attention drop')
        attention = tf.nn.dropout(attention, drop_prob)

    return op.weighted_sum(attention, V)
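The op.* helpers used by attention are defined elsewhere in the repo. The sketches below show plausible implementations that are consistent with the shape comments above; they are assumptions for illustration, not the original code:

import tensorflow as tf

def dot_sim(Q, K):
    '''Scaled dot-product similarity: [batch, Q_time, K_time] (sketch).'''
    d = tf.cast(tf.shape(Q)[-1], tf.float32)
    return tf.matmul(Q, K, transpose_b=True) / tf.sqrt(d)

def bilinear_sim(Q, K):
    '''Bilinear similarity Q * W * K^T: [batch, Q_time, K_time] (sketch).'''
    W = tf.get_variable('bilinear_W',
                        shape=[Q.shape[-1], K.shape[-1]],
                        initializer=tf.orthogonal_initializer())
    QW = tf.tensordot(Q, W, axes=[[2], [0]])  # [batch, Q_time, K_dimension]
    return tf.matmul(QW, K, transpose_b=True)  # [batch, Q_time, K_time]

def mask(row_lengths, col_lengths, max_row_length, max_col_length):
    '''Outer product of two padding masks: [batch, max_row_length, max_col_length] (sketch).'''
    row_mask = tf.sequence_mask(row_lengths, max_row_length, dtype=tf.float32)  # [batch, max_row]
    col_mask = tf.sequence_mask(col_lengths, max_col_length, dtype=tf.float32)  # [batch, max_col]
    return tf.einsum('bi,bj->bij', row_mask, col_mask)

def weighted_sum(weights, V):
    '''weights: [batch, Q_time, K_time], V: [batch, K_time, V_dim] -> [batch, Q_time, V_dim].'''
    return tf.matmul(weights, V)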

def FFN(x, out_dimension_0=None, out_dimension_1=None):
    '''Add a two-layer feed-forward network: max(0, x*W0 + b0)*W1 + b1.

    Args:
        x: [batch, time, dimension]
        out_dimension_0: output dimension of the first dense layer
        out_dimension_1: output dimension of the second dense layer

    Returns:
        [batch, time, out_dimension_1]
    '''
    with tf.variable_scope('FFN_1'):
        y = op.dense(x, out_dimension_0)
        y = tf.nn.relu(y)
    with tf.variable_scope('FFN_2'):
        z = op.dense(y, out_dimension_1)
    return z
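op.dense is likewise defined elsewhere; here is a minimal sketch matching how FFN uses it (the defaults and the add_bias switch are assumptions):

import tensorflow as tf

def dense(x, out_dimension=None, add_bias=True):
    '''Apply x*W (+ b) along the last axis of a [batch, time, dimension] tensor (sketch).'''
    if out_dimension is None:
        out_dimension = x.shape[-1]
    W = tf.get_variable('weights',
                        shape=[x.shape[-1], out_dimension],
                        initializer=tf.orthogonal_initializer())
    y = tf.tensordot(x, W, axes=[[2], [0]])  # [batch, time, out_dimension]
    if add_bias:
        b = tf.get_variable('bias', shape=[out_dimension],
                            initializer=tf.zeros_initializer())
        y = y + b
    return y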

The block therefore takes two sequences as input (a query sequence attending over a key/value sequence) and outputs a tensor of shape [batch, sequence_len, hidden_dim], where sequence_len is the length of the query sequence.
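As a quick sanity check of those shapes (reusing the assumed placeholders from the first sketch), each output keeps its own query's time axis:

print(t_a_r.shape)  # (32, 50, 200) = [batch, max_turn_len, emb_dim]
print(r_a_t.shape)  # (32, 50, 200) = [batch, max_response_len, emb_dim]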