### 机器翻译

• 现成工具：沙拉查词
• 机器翻译原理
• 最佳翻译
• 会意会的机器翻译：你有算法，我有意会

## 机器翻译原理

• P(Y|X)：条件概率，表示在 X 条件下 Y 发生的概率
• P(X, Y)：条件 X、事件 Y 同时发生的概率（联合概率）
• P(X)：条件 X 发生的概率

## 最佳翻译

• 1 March 2001
• 2001-03-01

``````# nmt_utils.py
import numpy as np
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt

# Faker produces the random date objects; seeding both Faker and `random`
# makes the generated dataset reproducible across runs.
fake = Faker()
Faker.seed(12345)
random.seed(12345)

# Babel date-format patterns used to render human-readable dates.
# 'full' is repeated many times to bias sampling towards long formats.
FORMATS = ['short',
'medium',
'long',
'full',
'full',
'full',
'full',
'full',
'full',
'full',
'full',
'full',
'full',
'd MMM YYY',
'd MMMM YYY',
'dd MMM YYY',
'd MMM, YYY',
'd MMMM, YYY',
'dd, MMM YYY',
'd MM YY',
'd MMMM YYY',
'MMMM d YYY',
'MMMM d, YYY',
'dd.MM.YY']

# Locale(s) passed to babel.dates.format_date.
LOCALES = ['en_US']

# NOTE(review): the extracted snippet lost the two function headers, the body
# of the try-block, and the load_date() call; reconstructed below from the
# statements that survived extraction.
def load_date():
    """Generate one fake date.

    Returns:
        (human_readable, machine_readable, date_object), or
        (None, None, None) when Babel fails to format the date.
    """
    dt = fake.date_object()

    try:
        # Render in a randomly chosen human format, normalised to lowercase
        # with commas stripped (mirrors the normalisation in string_to_int).
        human_readable = format_date(dt, format=random.choice(FORMATS),
                                     locale=random.choice(LOCALES))
        human_readable = human_readable.lower()
        human_readable = human_readable.replace(',', '')
        machine_readable = dt.isoformat()
    except AttributeError:
        return None, None, None

    return human_readable, machine_readable, dt


def load_dataset(m):
    """Create a dataset of m (human, machine) date pairs plus vocabularies.

    Returns:
        dataset: list of (human_readable, machine_readable) pairs
        human:   dict char -> index over the human-readable characters,
                 with '<unk>' and '<pad>' appended
        machine: dict char -> index over the machine-readable characters
        inv_machine: inverse of `machine` (index -> char)
    """
    human_vocab = set()
    machine_vocab = set()
    dataset = []

    for _ in tqdm(range(m)):
        # Avoid rebinding the parameter `m` (the original shadowed it).
        h, machine_str, _dt = load_date()
        if h is not None:
            dataset.append((h, machine_str))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(machine_str))

    human = dict(zip(sorted(human_vocab) + ['<unk>', '<pad>'],
                     list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {v: k for k, v in inv_machine.items()}

    return dataset, human, machine, inv_machine

def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """Turn (human, machine) date pairs into padded index arrays and one-hot tensors.

    Returns (X, Y, Xoh, Yoh): integer-encoded inputs/targets of lengths
    Tx / Ty, plus their one-hot encodings.
    """
    sources, targets = zip(*dataset)

    X = np.array([string_to_int(src, Tx, human_vocab) for src in sources])
    Y = [string_to_int(tgt, Ty, machine_vocab) for tgt in targets]

    Xoh = np.array([to_categorical(row, num_classes=len(human_vocab)) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=len(machine_vocab)) for row in Y])

    return X, np.array(Y), Xoh, Yoh

def string_to_int(string, length, vocab):
    """Convert a string to a list of `length` vocabulary indices.

    The string is lowercased and stripped of commas, truncated to `length`,
    and right-padded with vocab['<pad>'].
    """
    string = string.lower()
    string = string.replace(',', '')

    if len(string) > length:
        string = string[:length]

    # BUG FIX: the original used vocab.get(x, '<unk>'), which inserted the
    # literal string '<unk>' instead of its index, producing a mixed int/str
    # list that breaks one-hot encoding downstream.
    unk = vocab.get('<unk>')
    rep = [vocab.get(ch, unk) for ch in string]

    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))

    return rep

def int_to_string(ints, inv_vocab):
    """Map a sequence of indices back to characters via `inv_vocab`."""
    return [inv_vocab[index] for index in ints]

# Sample inputs used by run_examples() as a quick smoke test.
EXAMPLES = ['3 May 1979', '5 Apr 09', '20th February 2016', 'Wed 10 Jul 2007']

def run_example(model, input_vocabulary, inv_output_vocabulary, text, time_steps=30):
    """Translate a single date string with `model`.

    BUG FIX: the original referenced an undefined global TIME_STEPS
    (NameError at call time); it is now a parameter defaulting to 30,
    matching Tx used elsewhere in this file.

    Returns the predicted output as a list of characters.
    """
    encoded = string_to_int(text, time_steps, input_vocabulary)
    prediction = model.predict(np.array([encoded]))
    prediction = np.argmax(prediction[0], axis=-1)
    return int_to_string(prediction, inv_output_vocabulary)

def run_examples(model, input_vocabulary, inv_output_vocabulary, examples=EXAMPLES):
    """Translate each example with run_example() and print input/output pairs.

    Returns the list of predicted output strings.
    """
    predicted = []
    for example in examples:
        chars = run_example(model, input_vocabulary, inv_output_vocabulary, example)
        result = ''.join(chars)
        predicted.append(result)
        print('input:', example)
        print('output:', result)
    return predicted

def softmax(x, axis=1):
    """Numerically stable softmax along `axis` for tensors of rank >= 2.

    Rank-2 tensors use Keras' built-in softmax; higher ranks are handled
    manually (shift by the max, exponentiate, normalise).
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    if ndim > 2:
        # Subtract the per-slice max before exponentiating to avoid overflow.
        shifted = x - K.max(x, axis=axis, keepdims=True)
        exps = K.exp(shifted)
        return exps / K.sum(exps, axis=axis, keepdims=True)
    raise ValueError('Cannot apply softmax to a tensor that is 1D')

def plot_attention_map(model, input_vocabulary, inv_output_vocabulary, text, n_s = 128, num = 6, Tx = 30, Ty = 10):
attention_map = np.zeros((10, 30))
Ty, Tx = attention_map.shape

s0 = np.zeros((1, n_s))
c0 = np.zeros((1, n_s))
layer = model.layers[num]

encoded = np.array(string_to_int(text, Tx, input_vocabulary)).reshape((1, 30))
encoded = np.array(list(map(lambda x: to_categorical(x, num_classes=len(input_vocabulary)), encoded)))

f = K.function(model.inputs, [layer.get_output_at(t) for t in range(Ty)])
r = f([encoded, s0, c0])

for t in range(Ty):
for t_prime in range(Tx):
attention_map[t][t_prime] = r[t][0,t_prime,0]

prediction = model.predict([encoded, s0, c0])

predicted_text = []
for i in range(len(prediction)):
predicted_text.append(int(np.argmax(prediction[i], axis=1)))

predicted_text = list(predicted_text)
predicted_text = int_to_string(predicted_text, inv_output_vocabulary)
text_ = list(text)

input_length = len(text)
output_length = Ty

plt.clf()
f = plt.figure(figsize=(8, 8.5))

i = ax.imshow(attention_map, interpolation='nearest', cmap='Blues')

cbaxes = f.add_axes([0.2, 0, 0.6, 0.03])
cbar = f.colorbar(i, cax=cbaxes, orientation='horizontal')
cbar.ax.set_xlabel('Alpha value (Probability output of the "softmax")', labelpad=2)

ax.set_yticks(range(output_length))
ax.set_yticklabels(predicted_text[:output_length])

ax.set_xticks(range(input_length))
ax.set_xticklabels(text_[:input_length], rotation=45)

ax.set_xlabel('Input Sequence')
ax.set_ylabel('Output Sequence')

ax.grid()

return attention_map``````
``````from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import numpy as np

from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from nmt_utils import *
import matplotlib.pyplot as plt

# Number of (human, machine) date pairs to generate.
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

# Tx: padded length of the human-readable input; Ty: length of 'YYYY-MM-DD'.
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

index = 0

# Layers shared by one_step_attention(); created once at module level so the
# same weights are reused at every decoding timestep.
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights')
dotor = Dot(axes = 1)

def one_step_attention(a, s_prev):
    """Compute the attention context vector for one decoding timestep.

    a      -- hidden states of the pre-attention Bi-LSTM, one per input step
    s_prev -- previous hidden state of the post-attention LSTM

    Returns the context vector (attention-weighted sum of `a`).
    """
    # Broadcast s_prev across all Tx input positions so it can be compared
    # with each encoder state.
    repeated = repeator(s_prev)
    merged = concatenator([a, repeated])
    energies = densor(merged)
    # Softmax over the time axis yields the attention weights.
    weights = activator(energies)
    return dotor([weights, a])

# n_a: hidden units per direction of the pre-attention Bi-LSTM;
# n_s: hidden units of the post-attention (decoder) LSTM.
n_a = 64
n_s = 128
# Decoder cell and output projection, shared across all output timesteps.
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)

def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """Build the attention-based NMT model.

    Inputs: one-hot source of shape (Tx, human_vocab_size) plus initial
    decoder states s0/c0 of size n_s. Produces Ty softmax outputs, one per
    target character.

    NOTE: `Model` must be imported from tensorflow.keras.models — the
    original file never imported it (NameError); see the import block.
    """
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    outputs = []

    # Pre-attention Bi-LSTM over the whole input sequence.
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)

    for t in range(Ty):
        # Attention context for this step, then one decoder step.
        context = one_step_attention(a, s)
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
        out = output_layer(s)
        outputs.append(out)

    model = Model(inputs=[X, s0, c0], outputs=outputs)
    return model

model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
model.summary()

out = model.compile(optimizer=Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01), metrics=['accuracy'], loss='categorical_crossentropy')

s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))
model.fit([Xoh, s0, c0], outputs, epochs=10, batch_size=100)

s1 = np.zeros((1, n_s))
c1 = np.zeros((1, n_s))

EXAMPLES = ['March 3rd 2001', '1 March 2001']
for example in EXAMPLES:
source = string_to_int(example, Tx, human_vocab)
source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))
source = np.expand_dims(source, axis=0)
prediction = model([source, s1, c1])
prediction = np.argmax(prediction, axis = -1)
output = [inv_machine_vocab[int(i)] for i in prediction]

print("source:", example)
print("output:", ''.join(output))``````

``````source: March 3rd 2001
output: 2001-03-03

source: 1 March 2001
output: 2001-03-01``````

### 会意会的机器翻译：你有算法，我有意会

• A：用词典或者机器，翻译原文
• B：直译过来的中文句子
• C：重构B，把直译改成意译，用中文习惯的说法
• C -> A：对比原文与意译，记录各种不同
• 扩展练习：把学到的知识迁移到其他场景中
• 五次复习法：巩固学习成果

e.g. 阅读的时候读到这么一段原文：

I'm sorry about not getting back to you sooner.

I couldn’t access the internet from my laptop for some reason.

A：在微信里直译得到 B

• 很抱歉，没有早点回复你
• 因为某些原因，我无法从我的笔记本上上网

C：用中文习惯的说法，把直译改成意译

• 不好意思，这么晚才回复你
• 不知道怎么回事，我的笔记本连不上网

C -> A：详细分析，得到这两个句子之间的关联、替换

• 这么晚：不是用 so late，而是 not…sooner
• 连上网：不是用 connect to，而是 access
• 不知道怎么回事：不是用 don’t know why 或 for no reason，而是 for some reason

• 我的笔记本连不上网：主语不是笔记本，而是 I couldn’t access the internet from my laptop. 主语 I

• 不好意思，这么晚才回复你。替换为 不好意思，拖了这么久才跟你说这件事。
• I'm sorry about not getting back to you sooner. 替换为 I'm sorry about not telling you this sooner.
• 不知道怎么回事，我的笔记本连不上网。替换为 不知道怎么回事，我的手机上不了微信。
• I couldn’t access the internet from my laptop for some reason. 替换为 I couldn’t access Wechat from my phone for some reason.