
Commit a86f344

fix bug(#64), update mask, fix typo
1 parent c5d0f74 commit a86f344

2 files changed: 40 additions, 62 deletions


code/chapter10_natural-language-processing/10.12_machine-translation.ipynb

Lines changed: 31 additions & 54 deletions
@@ -16,7 +16,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"1.0.0 cpu\n"
+"1.2.0 cpu\n"
 ]
 }
 ],
@@ -52,9 +52,7 @@
 {
 "cell_type": "code",
 "execution_count": 2,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "# Record all the tokens of a sequence in all_tokens so that a vocabulary can be built later, then append PAD after the sequence until the sequence\n",
@@ -75,9 +73,7 @@
 {
 "cell_type": "code",
 "execution_count": 3,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def read_data(max_seq_len):\n",
@@ -130,9 +126,7 @@
 {
 "cell_type": "code",
 "execution_count": 5,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "class Encoder(nn.Module):\n",
@@ -183,9 +177,7 @@
 {
 "cell_type": "code",
 "execution_count": 7,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def attention_model(input_size, attention_size):\n",
@@ -198,9 +190,7 @@
 {
 "cell_type": "code",
 "execution_count": 8,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def attention_forward(model, enc_states, dec_state):\n",
@@ -250,9 +240,7 @@
 {
 "cell_type": "code",
 "execution_count": 10,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "class Decoder(nn.Module):\n",
@@ -261,8 +249,9 @@
 "        super(Decoder, self).__init__()\n",
 "        self.embedding = nn.Embedding(vocab_size, embed_size)\n",
 "        self.attention = attention_model(2*num_hiddens, attention_size)\n",
-"        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size\n",
-"        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
+"        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size\n",
+"        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
+"                          num_layers, dropout=drop_prob)\n",
 "        self.out = nn.Linear(num_hiddens, vocab_size)\n",
 "\n",
 "    def forward(self, cur_input, state, enc_states):\n",
@@ -272,8 +261,8 @@
 "        \"\"\"\n",
 "        # Compute the context vector with the attention mechanism\n",
 "        c = attention_forward(self.attention, enc_states, state[-1])\n",
-"        # Concatenate the embedded input and the context vector along the feature dimension\n",
-"        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)\n",
+"        # Concatenate the embedded input and the context vector along the feature dimension, (batch size, num_hiddens+embed_size)\n",
+"        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
 "        # Add a time-step dimension to the concatenation; the number of time steps is 1\n",
 "        output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
 "        # Remove the time-step dimension; the output shape is (batch size, output vocabulary size)\n",
@@ -295,9 +284,7 @@
295284
{
296285
"cell_type": "code",
297286
"execution_count": 11,
298-
"metadata": {
299-
"collapsed": true
300-
},
287+
"metadata": {},
301288
"outputs": [],
302289
"source": [
303290
"def batch_loss(encoder, decoder, X, Y, loss):\n",
@@ -308,25 +295,23 @@
 "    dec_state = decoder.begin_state(enc_state)\n",
 "    # The decoder's input at the initial time step is BOS\n",
 "    dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
-"    # We will use the mask variable to ignore the loss where the label is the padding token PAD\n",
+"    # We will use the mask variable to ignore the loss where the label is the padding token PAD; initialized to all ones\n",
 "    mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
 "    l = torch.tensor([0.0])\n",
 "    for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
 "        dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)\n",
 "        l = l + (mask * loss(dec_output, y)).sum()\n",
 "        dec_input = y  # use teacher forcing\n",
 "        num_not_pad_tokens += mask.sum().item()\n",
-"        # Set the mask to 0 at positions corresponding to PAD; the original text uses y != out_vocab.stoi[EOS], which seems wrong\n",
-"        mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
+"        # Everything after EOS is PAD. The line below guarantees that once EOS is encountered, mask stays 0 in the remaining iterations\n",
+"        mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
 "    return l / num_not_pad_tokens"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 12,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
@@ -358,11 +343,11 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"epoch 10, loss 0.441\n",
-"epoch 20, loss 0.183\n",
-"epoch 30, loss 0.100\n",
-"epoch 40, loss 0.046\n",
-"epoch 50, loss 0.025\n"
+"epoch 10, loss 0.475\n",
+"epoch 20, loss 0.245\n",
+"epoch 30, loss 0.157\n",
+"epoch 40, loss 0.052\n",
+"epoch 50, loss 0.039\n"
 ]
 }
 ],
@@ -386,9 +371,7 @@
 {
 "cell_type": "code",
 "execution_count": 14,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def translate(encoder, decoder, input_seq, max_seq_len):\n",
@@ -443,9 +426,7 @@
 {
 "cell_type": "code",
 "execution_count": 16,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def bleu(pred_tokens, label_tokens, k):\n",
@@ -466,9 +447,7 @@
 {
 "cell_type": "code",
 "execution_count": 17,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "def score(input_seq, label_seq, k):\n",
@@ -504,29 +483,27 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"bleu 0.658, predict: they are russian .\n"
+"bleu 0.658, predict: they are exhausted .\n"
 ]
 }
 ],
 "source": [
-"score('ils sont canadiens .', 'they are canadian .', k=2)"
+"score('ils sont canadienne .', 'they are canadian .', k=2)"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": []
 }
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python [conda env:anaconda3]",
+"display_name": "Python [conda env:py36]",
 "language": "python",
-"name": "conda-env-anaconda3-py"
+"name": "conda-env-py36-py"
 },
 "language_info": {
 "codemirror_mode": {
@@ -538,7 +515,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.8"
+"version": "3.6.2"
 }
 },
 "nbformat": 4,

docs/chapter10_natural-language-processing/10.12_machine-translation.md

Lines changed: 9 additions & 8 deletions
@@ -165,8 +165,9 @@ class Decoder(nn.Module):
         super(Decoder, self).__init__()
         self.embedding = nn.Embedding(vocab_size, embed_size)
         self.attention = attention_model(2*num_hiddens, attention_size)
-        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
-        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
+        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
+        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens,
+                          num_layers, dropout=drop_prob)
         self.out = nn.Linear(num_hiddens, vocab_size)
 
     def forward(self, cur_input, state, enc_states):
@@ -176,8 +177,8 @@ class Decoder(nn.Module):
         """
         # Compute the context vector with the attention mechanism
         c = attention_forward(self.attention, enc_states, state[-1])
-        # Concatenate the embedded input and the context vector along the feature dimension
-        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)
+        # Concatenate the embedded input and the context vector along the feature dimension, (batch size, num_hiddens+embed_size)
+        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1)
         # Add a time-step dimension to the concatenation; the number of time steps is 1
         output, state = self.rnn(input_and_c.unsqueeze(0), state)
         # Remove the time-step dimension; the output shape is (batch size, output vocabulary size)
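The change above fixes the GRU input size: the context vector c returned by attention_forward is a weighted sum of the encoder's hidden states, so it has num_hiddens features, and concatenating it with the embed_size-dimensional embedding yields num_hiddens + embed_size input features (the old 2*embed_size only matched when embed_size happened to equal num_hiddens). A minimal shape check, using illustrative sizes rather than the notebook's hyperparameters:

```python
import torch

# Illustrative sizes only; the notebook's actual hyperparameters may differ.
batch_size, embed_size, num_hiddens = 4, 32, 64

embedded = torch.randn(batch_size, embed_size)  # embedding of the current input token
c = torch.randn(batch_size, num_hiddens)        # context vector: weighted sum of encoder hidden states
input_and_c = torch.cat((embedded, c), dim=1)
print(input_and_c.shape)  # torch.Size([4, 96]) == (batch_size, embed_size + num_hiddens)
```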
@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
         l = l + (mask * loss(dec_output, y)).sum()
         dec_input = y  # use teacher forcing
         num_not_pad_tokens += mask.sum().item()
-        # Set the mask to 0 at positions corresponding to PAD; the original text uses y != out_vocab.stoi[EOS], which seems wrong
-        mask = mask * (y != out_vocab.stoi[PAD]).float()
+        # Everything after EOS is PAD. The line below guarantees that once EOS is encountered, mask stays 0 in the remaining iterations
+        mask = mask * (y != out_vocab.stoi[EOS]).float()
     return l / num_not_pad_tokens
 ```
 
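The updated rule relies on the fact that in each label sequence everything after EOS is PAD: multiplying by (y != EOS) zeroes the mask from the step after EOS onward, so the EOS step itself still contributes to the loss and to num_not_pad_tokens, while all trailing PAD steps are ignored. A standalone sketch of how the mask evolves (the token indices below are made up for illustration; in the real code EOS and PAD come from out_vocab):

```python
import torch

EOS, PAD = 2, 0  # hypothetical indices, for illustration only
# Labels for a batch of 2 sequences, listed step by step (shape: seq_len x batch):
# sequence 0: w5 w6 EOS PAD ; sequence 1: w7 EOS PAD PAD
Y_steps = torch.tensor([[5, 7],
                        [6, EOS],
                        [EOS, PAD],
                        [PAD, PAD]])

mask = torch.ones(2)
for y in Y_steps:
    print(mask.tolist())               # mask applied to this step's loss
    mask = mask * (y != EOS).float()   # zero out a sequence once its label was EOS
# Prints [1.0, 1.0], [1.0, 1.0], [1.0, 0.0], [0.0, 0.0]:
# the EOS step is still counted, every PAD step after it is masked out.
```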
@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)
 
 Machine translation results are usually evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether that subsequence appears in the label sequence.
 
-Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the prediction that match the label sequence to the total number of $n$-word subsequences in the prediction. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. BLEU is then defined as
+Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the prediction that match the label sequence to the total number of $n$-word subsequences in the prediction. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. BLEU is then defined as
 
 $$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
 
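As an independent illustration of the formula above (a hedged sketch, not the notebook's own bleu helper), the following reproduces the worked example; the n-gram matching is clipped, so the repeated B in the prediction can be matched at most once:

```python
import math
from collections import Counter

def ngram_precision(pred, label, n):
    """Clipped n-gram precision p_n for the definition above."""
    pred_ngrams = [tuple(pred[i:i + n]) for i in range(len(pred) - n + 1)]
    label_counts = Counter(tuple(label[i:i + n]) for i in range(len(label) - n + 1))
    matches = 0
    for g in pred_ngrams:
        if label_counts[g] > 0:   # each label n-gram can be matched at most once
            matches += 1
            label_counts[g] -= 1
    return matches / len(pred_ngrams)

def bleu_score(pred, label, k):
    # brevity penalty: exp(min(0, 1 - len_label / len_pred))
    score = math.exp(min(0, 1 - len(label) / len(pred)))
    for n in range(1, k + 1):
        score *= ngram_precision(pred, label, n) ** (1 / 2 ** n)
    return score

label, pred = list("ABCDEF"), list("ABBCD")
print([ngram_precision(pred, label, n) for n in range(1, 5)])  # [0.8, 0.75, 0.333..., 0.0] = [4/5, 3/4, 1/3, 0]
print(bleu_score(pred, label, k=2))                            # ~0.68 for this toy pair
```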
@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
 Test a sample that is not in the training set.
 
 ``` python
-score('ils sont canadiens .', 'they are canadian .', k=2)
+score('ils sont canadienne .', 'they are canadian .', k=2)
 ```
 Output:
 ```
