from d2l import torch as d2l
import torch
import os

Language modeling predicts one sequence. Translation maps between sequences — different lengths, different word orders. This unaligned source-to-target structure is the sequence-to-sequence (seq2seq) setting that drives the rest of this chapter and most of the attention chapter.
This deck builds the data plumbing for English→French:
<bos> / <eos> / <pad> / <unk> special tokens.
Tab-separated bilingual sentence pairs — one English, one French per line:
Lower-case, replace non-breaking spaces, insert a space before punctuation so ,.!? become their own tokens:
@d2l.add_to_class(MTFraEng)
def _preprocess(self, text):
    """Normalize raw text so punctuation can be split off as its own token.

    Non-breaking spaces (U+202F and U+00A0) become ordinary spaces, the
    text is lower-cased, and a single space is inserted before each of
    ``,.!?`` that is not the first character and is not already preceded
    by a space.

    Args:
        text: Raw text as one string.

    Returns:
        The normalized string.
    """
    # Replace non-breaking space with space, then lower-case *before*
    # scanning: str.lower() can change the string length (U+0130 lower-cases
    # to two code points), so the look-behind below must index the very
    # string that is being enumerated or the positions drift apart.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # Insert space between words and punctuation marks
    out = []
    for i, char in enumerate(text):
        if i > 0 and char in ',.!?' and text[i - 1] != ' ':
            out.append(' ')
        out.append(char)
    return ''.join(out)

# Word tokens (modern systems use BPE / WordPiece). Append <eos> so the model
# knows when to stop generating.
@d2l.add_to_class(MTFraEng)
def _tokenize(self, text, max_examples=None):
    """Split text into parallel source and target token lists.

    Each line of ``text`` (lines are separated by ``'\\n'``) is split on
    tabs; only lines with exactly two fields are kept. Both fields get
    ``<eos>`` appended and are then split on single spaces, dropping empty
    tokens.

    Args:
        text: The full text, one sentence pair per line.
        max_examples: If truthy, stop as soon as the line index reaches
            this value. The index counts every line, including lines that
            are skipped for not having two fields.

    Returns:
        A ``(src, tgt)`` tuple of equally long lists of token lists.
    """
    def with_eos(sentence):
        # Skip empty tokens
        return [tok for tok in f'{sentence} <eos>'.split(' ') if tok]

    sources, targets = [], []
    for index, row in enumerate(text.split('\n')):
        if max_examples and index >= max_examples:
            break
        fields = row.split('\t')
        if len(fields) != 2:
            continue
        sources.append(with_eos(fields[0]))
        targets.append(with_eos(fields[1]))
    return sources, targets

# Two parallel lists: src[i] and tgt[i] are the i-th English sentence and its
# French translation:
([['go', '.', '<eos>'],
['hi', '.', '<eos>'],
['run', '!', '<eos>'],
['run', '!', '<eos>'],
['who', '?', '<eos>'],
['wow', '!', '<eos>']],
[['va', '!', '<eos>'],
['salut', '!', '<eos>'],
['cours', '!', '<eos>'],
['courez', '!', '<eos>'],
['qui', '?', '<eos>'],
['ça', 'alors', '!', '<eos>']])
Most sentences are short — under 20 tokens. That justifies a small fixed num_steps and a pad/truncate strategy.
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Draw a two-series histogram of the item lengths in two lists.

    Args:
        legend: Legend entries passed to ``d2l.plt.legend``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        xlist: First collection; ``len()`` of each item is histogrammed.
        ylist: Second collection, treated the same way.
    """
    d2l.set_figsize()
    first_lengths = [len(item) for item in xlist]
    second_lengths = [len(item) for item in ylist]
    hist_result = d2l.plt.hist([first_lengths, second_lengths])
    # Third element of the hist result holds the bar containers per series.
    bar_groups = hist_result[2]
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the second series so the two are distinguishable.
    for bar in bar_groups[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)

# Truncate long sequences, pad short ones with <pad>. Track valid_len (real
# tokens, no padding) — the model needs it to mask attention/loss later.
# Target sequences get a <bos> prefix; the label is the target shifted by one.
@d2l.add_to_class(MTFraEng)
def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
    """Turn raw paired text into index arrays plus the two vocabularies.

    The text is preprocessed and tokenized (limited to
    ``self.num_train + self.num_val`` lines), every sentence is forced to
    ``self.num_steps`` tokens, and target sentences are then prefixed with
    ``<bos>``.

    Args:
        raw_text: Unprocessed text handed to ``self._preprocess``.
        src_vocab: Existing source vocabulary; when ``None`` one is built
            from the source sentences with ``min_freq=2``.
        tgt_vocab: Existing target vocabulary; same rule as ``src_vocab``.

    Returns:
        ``((src_array, decoder_input, src_valid_len, label), src_vocab,
        tgt_vocab)`` where ``decoder_input`` is the target array without
        its last column and ``label`` is the target array without its
        first column.
    """
    def _fit_length(tokens, length):
        # Too long: keep the first length - 1 tokens and close with <eos>.
        if len(tokens) > length:
            return tokens[:length - 1] + ['<eos>']
        # Otherwise fill up with <pad> to exactly `length` tokens.
        return tokens + ['<pad>'] * (length - len(tokens))

    def _build_array(sentences, vocab, is_tgt=False):
        fitted = [_fit_length(sent, self.num_steps) for sent in sentences]
        if is_tgt:
            fitted = [['<bos>'] + sent for sent in fitted]
        if vocab is None:
            vocab = d2l.Vocab(fitted, min_freq=2)
        array = d2l.tensor([vocab[sent] for sent in fitted])
        # Count, along axis 1, the entries that are not the <pad> index.
        not_pad = d2l.astype(array != vocab['<pad>'], d2l.int32)
        valid_len = d2l.reduce_sum(not_pad, 1)
        return array, vocab, valid_len

    cleaned = self._preprocess(raw_text)
    src, tgt = self._tokenize(cleaned, self.num_train + self.num_val)
    src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
    tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
    decoder_input = tgt_array[:,:-1]
    label = tgt_array[:,1:]
    return ((src_array, decoder_input, src_valid_len, label),
            src_vocab, tgt_vocab)

# Standard split — first num_train examples for training, the rest for
# validation:
Source IDs, decoder input (target shifted right with <bos>), valid length, label (target shifted left):
# Build the dataset with a batch size of 3.
data = MTFraEng(batch_size=3)
# Pull the first minibatch from the training dataloader; it unpacks into four
# items: source, decoder input, source valid length, and label.
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))
# Print each item cast to int32.
print('source:', d2l.astype(src, d2l.int32))
print('decoder input:', d2l.astype(tgt, d2l.int32))
print('source len excluding pad:', d2l.astype(src_valid_len, d2l.int32))
print('label:', d2l.astype(label, d2l.int32))
source: tensor([[14, 26, 3, 2, 1, 1, 1, 1, 1],
[25, 33, 5, 2, 1, 1, 1, 1, 1],
[65, 13, 5, 2, 1, 1, 1, 1, 1]], dtype=torch.int32)
decoder input: tensor([[ 2, 22, 0, 5, 3, 1, 1, 1, 1],
[ 2, 14, 116, 33, 74, 5, 3, 1, 1],
[ 2, 30, 0, 5, 3, 1, 1, 1, 1]], dtype=torch.int32)
source len excluding pad: tensor([4, 4, 4], dtype=torch.int32)
label: tensor([[ 22, 0, 5, 3, 1, 1, 1, 1, 1],
[ 14, 116, 33, 74, 5, 3, 1, 1, 1],
[ 30, 0, 5, 3, 1, 1, 1, 1, 1]], dtype=torch.int32)
Convert IDs back to tokens for inspection:
source: ['hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
target: ['<bos>', 'salut', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
<unk> to keep it manageable.
num_steps for batching; record valid_len to mask later.
<bos> prefix; label is target shifted left by one — that’s the teacher-forcing setup.