import collections
from d2l import tensorflow as d2l
import math
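import tensorflow as tf

Two RNNs glued together (Sutskever et al., 2014; Cho et al., 2014): an encoder that reads the source sequence up to <eos>, and a decoder that generates the target sequence until it emits <eos>. The decoder is just a conditional language model.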
Seq2seq with RNN encoder and decoder. <eos> ends the sequence; <bos> starts decoding.
The single-vector context is a bottleneck — motivates attention in the next chapter.
Decoder input: <bos>, “Ils”, “regardent”, “.”. Decoder label: “Ils”, “regardent”, “.”, <eos>.
The MTFraEng pipeline already produces this shifted pair. Same self-supervised setup as a language model — only now the encoder output is concatenated as extra context.
Alternative — scheduled sampling: occasionally feed the prediction back. More realistic at inference but harder to optimize.
Embedding layer for input tokens, then a multilayer GRU. Output: per-step hidden states (top layer) and final state (all layers):
class Seq2SeqEncoder(d2l.Encoder):
    """The RNN encoder for sequence-to-sequence learning.

    Embeds the source tokens and runs them through a ``d2l.GRU`` built with
    ``num_hiddens``, ``num_layers`` and ``dropout``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        """Create the embedding layer and the GRU.

        Args:
            vocab_size: Size of the source vocabulary (embedding input dim).
            embed_size: Dimension of each token embedding.
            num_hiddens: Passed to ``d2l.GRU`` as the hidden size.
            num_layers: Passed to ``d2l.GRU`` as the number of layers.
            dropout: Passed to ``d2l.GRU``; defaults to 0.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)

    def call(self, X, *args):
        """Encode a batch of token indices.

        Args:
            X: Token indices of shape (batch_size, num_steps).
            *args: Accepted and ignored by this encoder.

        Returns:
            A pair ``(outputs, state)`` as produced by the GRU.
        """
        # X shape: (batch_size, num_steps)
        # Transpose first so the sequence is time-major for the GRU.
        embs = self.embedding(d2l.transpose(X))
        # embs shape: (num_steps, batch_size, embed_size)
        outputs, state = self.rnn(embs)
        # outputs shape: (num_steps, batch_size, num_hiddens)
        # state shape: (num_layers, batch_size, num_hiddens)
        return outputs, state


# Two-layer GRU, hidden 16, batch 4, seq length 9. Confirm shapes:
# Small configuration used only to sanity-check the encoder's output shape.
vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
batch_size, num_steps = 4, 9
encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)
# A dummy all-zeros batch is enough to check shapes.
X = d2l.zeros((batch_size, num_steps))
enc_outputs, enc_state = encoder(X)
d2l.check_shape(enc_outputs, (num_steps, batch_size, num_hiddens))

# Embed the previous target token, concatenate the encoder’s final hidden
# state at every decoder time step (context broadcast across the sequence),
# run a GRU, and project to vocab logits:
class Seq2SeqDecoder(d2l.Decoder):
    """The RNN decoder for sequence to sequence learning.

    At every time step the embedded target token is concatenated with a
    context vector taken from the last step of the encoder outputs, fed
    through a ``d2l.GRU``, and projected to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        """Create the embedding layer, the GRU and the output projection.

        Args:
            vocab_size: Size of the target vocabulary; also the width of the
                output logits.
            embed_size: Dimension of each token embedding.
            num_hiddens: Passed to ``d2l.GRU`` as the hidden size.
            num_layers: Passed to ``d2l.GRU`` as the number of layers.
            dropout: Passed to ``d2l.GRU``; defaults to 0.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def init_state(self, enc_all_outputs, *args):
        """Return the encoder's result unchanged as the decoder state.

        ``call`` unpacks this value as ``(enc_output, hidden_state)``.
        """
        return enc_all_outputs

    def call(self, X, state):
        """Decode a batch of target tokens.

        Args:
            X: Token indices of shape (batch_size, num_steps).
            state: Pair ``(enc_output, hidden_state)``.

        Returns:
            ``(outputs, [enc_output, hidden_state])`` where ``outputs`` holds
            the logits and ``hidden_state`` is the updated GRU state.
        """
        # X shape: (batch_size, num_steps)
        # embs shape: (num_steps, batch_size, embed_size)
        embs = self.embedding(d2l.transpose(X))
        enc_output, hidden_state = state
        # context shape: (batch_size, num_hiddens)
        context = enc_output[-1]
        # Broadcast context to (num_steps, batch_size, num_hiddens)
        context = tf.tile(tf.expand_dims(context, 0), (embs.shape[0], 1, 1))
        # Concat at the feature dimension
        embs_and_context = d2l.concat((embs, context), -1)
        outputs, hidden_state = self.rnn(embs_and_context, hidden_state)
        # Move the batch axis back to the front after the projection.
        outputs = d2l.transpose(self.dense(outputs), (1, 0, 2))
        # outputs shape: (batch_size, num_steps, vocab_size)
        # hidden_state shape: (num_layers, batch_size, num_hiddens)
        return outputs, [enc_output, hidden_state]


# End-to-end forward pass produces (batch, num_steps, vocab) logits and a
# state of shape (num_layers, batch, num_hiddens):
# Sanity-check the decoder's output and state shapes on the dummy batch.
decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)
state = decoder.init_state(encoder(X))
dec_outputs, state = decoder(X, state)
d2l.check_shape(dec_outputs, (batch_size, num_steps, vocab_size))
# state is [enc_output, hidden_state]; index 1 is the GRU hidden state.
d2l.check_len(state[1], num_layers)
d2l.check_shape(state[1][0], (batch_size, num_hiddens))

# Subclass EncoderDecoder, add the optimizer:
Layers of the RNN encoder–decoder: embedding → encoder GRU → decoder GRU (with broadcast context) → dense.
class Seq2Seq(d2l.EncoderDecoder):
    """The RNN encoder--decoder for sequence to sequence learning."""

    def __init__(self, encoder, decoder, tgt_pad, lr):
        """Wrap an encoder/decoder pair and record the hyperparameters.

        Args:
            encoder: Encoder module passed to ``d2l.EncoderDecoder``.
            decoder: Decoder module passed to ``d2l.EncoderDecoder``.
            tgt_pad: Index of the padding token in the target vocabulary.
            lr: Learning rate, read back as ``self.lr`` by the optimizer.
        """
        super().__init__(encoder, decoder)
        # Presumably stores the constructor arguments as attributes
        # (``self.lr`` is read below) -- confirm against d2l.
        self.save_hyperparameters()

    def validation_step(self, batch):
        """Plot the validation loss for one batch.

        The last element of ``batch`` is used as the label; everything
        before it is passed to the model.
        """
        Y_hat = self(*batch[:-1])
        self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)

    def configure_optimizers(self):
        """Return an Adam optimizer using ``self.lr``."""
        # Adam optimizer is used here
        return tf.keras.optimizers.Adam(learning_rate=self.lr)


# <pad> predictions shouldn’t contribute to the loss. Build a mask
# Y != tgt_pad and average only over real tokens:
\mathcal{L} = \frac{\sum_{b,t} \mathbf{1}\{y_{b,t} \ne \texttt{<pad>}\} \, \ell(\hat{\mathbf{y}}_{b,t}, y_{b,t})} {\sum_{b,t} \mathbf{1}\{y_{b,t} \ne \texttt{<pad>}\}}.
2-layer GRU, embed/hidden 256, dropout 0.2, Adam lr=0.005, gradient clip 1, 30 epochs:
# Build the dataset, the model and the trainer, then fit.
data = d2l.MTFraEng(batch_size=128)
embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
# Construct the model inside the device scope returned by d2l.try_gpu().
with d2l.try_gpu():
    encoder = Seq2SeqEncoder(
        len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
    decoder = Seq2SeqDecoder(
        len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
    model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'],
                    lr=0.005)
trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1)
trainer.fit(model, data)

# Run the encoder once, then loop: feed the previous prediction back, take
# argmax over the vocab. Stop after num_steps (or when <eos> appears —
# handled by the caller).
Predicting token by token: feed the previous prediction back, stop on <eos>.
@d2l.add_to_class(d2l.EncoderDecoder)
def predict_step(self, batch, device, num_steps,
                 save_attention_weights=False):
    """Greedily decode ``num_steps`` tokens for every sequence in ``batch``.

    Args:
        batch: 4-tuple ``(src, tgt, src_valid_len, _)``; only the first
            column of ``tgt`` is used, as the initial decoder input.
        device: Not used by this implementation.
        num_steps: Number of decoding steps to run.
        save_attention_weights: If true, collect
            ``self.decoder.attention_weights`` after every step.

    Returns:
        ``(predictions, attention_weights)``: the per-step argmax tokens
        concatenated along axis 1 (the initial input token is excluded),
        and the list of collected attention weights (empty unless
        requested).
    """
    src, tgt, src_valid_len, _ = batch
    # The encoder runs once; only the decoder is stepped in the loop.
    enc_all_outputs = self.encoder(src, src_valid_len, training=False)
    dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len)
    # Seed the output list with the first target token of each sequence.
    outputs, attention_weights = [d2l.expand_dims(tgt[:, 0], 1), ], []
    for _ in range(num_steps):
        # Feed back the most recent prediction as the next decoder input.
        Y, dec_state = self.decoder(outputs[-1], dec_state, training=False)
        outputs.append(d2l.argmax(Y, 2))
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weights.append(self.decoder.attention_weights)
    # Drop the seed token so only predictions are returned.
    return d2l.concat(outputs[1:], 1), attention_weights


# Compare prediction n-grams against reference. Geometric mean of n-gram
# precisions, with a brevity penalty so the model can’t game it by emitting
# “the the”.
\text{BLEU} = \exp\!\left(\min\!\left(0, 1 - \frac{\text{len}_{\text{label}}}{\text{len}_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n}.
def bleu(pred_seq, label_seq, k):
    """Compute the BLEU score of ``pred_seq`` against ``label_seq``.

    Both sequences are split on single spaces. The score is

        exp(min(0, 1 - len_label / len_pred)) * prod_n p_n ** (0.5 ** n)

    for n = 1 .. min(k, len_pred), where ``p_n`` is the clipped n-gram
    precision: each n-gram of the label can be matched at most as many times
    as it occurs in the label.

    Args:
        pred_seq: Predicted sentence as a space-separated string.
        label_seq: Reference sentence as a space-separated string.
        k: Longest n-gram order to take into account.

    Returns:
        The score as a float.
    """
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    # Brevity penalty: below 1 only when the prediction is shorter than
    # the label.
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, min(k, len_pred) + 1):
        # Remaining match budget for every n-gram of the label.
        label_subs = collections.Counter(
            ' '.join(label_tokens[i: i + n])
            for i in range(len_label - n + 1))
        num_matches = 0
        for i in range(len_pred - n + 1):
            ngram = ' '.join(pred_tokens[i: i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                # Consume one occurrence so repeats are not over-counted.
                label_subs[ngram] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score


# Run the model on a handful of English sentences and score each. Short,
# frequent patterns tend to translate cleanly; low BLEU on a sentence usually
# means one missing or misplaced token is enough to break an n-gram match:
# Translate a few English sentences and score each against its reference.
engs = ['i lost .', 'i\'m calm .', 'i\'m home .']
fras = ['j\'ai perdu .', 'je suis calme .', 'je suis chez moi .']
preds, _ = model.predict_step(
    data.build(engs, fras), d2l.try_gpu(), data.num_steps)
for en, fr, p in zip(engs, fras, preds):
    translation = []
    for token in data.tgt_vocab.to_tokens(p):
        # Everything from the first <eos> onward is discarded.
        if token == '<eos>':
            break
        translation.append(token)
    print(f'{en} => {translation}, bleu,'
          f'{bleu(" ".join(translation), fr, k=2):.3f}')

# Output:
# i lost . => ["j'ai", 'perdu', '.'], bleu,1.000
# i'm calm . => ['je', 'suis', 'calme', '.'], bleu,1.000
# i'm home . => ["j'en", 'suis', 'sûr', '.'], bleu,0.000
Exclude <pad> tokens from the loss.