Downloading the Tatoeba corpus

Machine Translation and the Dataset

Machine Translation Data

Language modeling predicts one sequence. Translation maps between sequences — different lengths, different word orders. This unaligned source-to-target structure is the sequence-to-sequence (seq2seq) setting that drives the rest of this chapter and most of the attention chapter.

This deck builds the data plumbing for English→French:

  • download a parallel corpus from Tatoeba,
  • normalize and word-tokenize both sides,
  • build separate source / target vocabularies,
  • pad and truncate to fixed length, with <bos> / <eos> / <pad> / <unk> special tokens.
from d2l import mxnet as d2l
from mxnet import np, npx
import os
npx.set_np()

Tab-separated bilingual sentence pairs — one English, one French per line:

class MTFraEng(d2l.DataModule):
    """English-French parallel sentence dataset."""

    def _download(self):
        """Fetch and unpack ``fra-eng.zip``, then return the corpus text.

        The archive is downloaded into ``self.root`` and extracted; the
        file ``fra-eng/fra.txt`` under ``self.root`` is then read as UTF-8.

        Returns:
            The full contents of ``fra.txt`` as a single string.
        """
        # NOTE(review): the hex string is presumably the archive's SHA-1
        # checksum used by d2l.download -- confirm against d2l.
        archive = d2l.download(
            d2l.DATA_URL+'fra-eng.zip', self.root,
            '94646ad1522d915e7b0f9296181140edcf86a4f5')
        d2l.extract(archive)
        corpus_path = self.root + '/fra-eng/fra.txt'
        with open(corpus_path, encoding='utf-8') as corpus:
            return corpus.read()
# Instantiate the dataset and read the raw corpus text; `data` and
# `raw_text` are reused by the cells that follow.
data = MTFraEng()
raw_text = data._download()
# Peek at the first 75 characters of the raw file.
print(raw_text[:75])

Preprocessing

Lower-case, replace non-breaking spaces, insert a space before punctuation so ,.!? become their own tokens:

@d2l.add_to_class(MTFraEng)
def _preprocess(self, text):
    """Normalize raw corpus text before tokenization.

    Steps, in order:
      1. replace narrow/regular non-breaking spaces with a plain space,
      2. lower-case the text,
      3. insert a space in front of each of ``,.!?`` that is not already
         preceded by a space (the very first character is left alone).

    Args:
        text: Raw corpus text.

    Returns:
        The normalized text as a single string.
    """
    # Replace non-breaking space with space
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    # Lower-case once, up front. str.lower() can change the string length
    # (e.g. U+0130 lower-cases to two code points), so the previous
    # character must be looked up in the same lowered string that is being
    # enumerated; otherwise indices drift and may run out of range.
    text = text.lower()
    # Insert space between words and punctuation marks
    out = []
    for i, char in enumerate(text):
        if i > 0 and char in ',.!?' and text[i - 1] != ' ':
            out.append(' ')
        out.append(char)
    return ''.join(out)
# Normalize the raw corpus; `text` is reused by the tokenization cell.
text = data._preprocess(raw_text)
# Peek at the first 80 characters of the normalized text.
print(text[:80])

Word-level tokenization

We tokenize at the word level (modern systems typically use subword schemes such as BPE or WordPiece). Append <eos> to every sentence so the model knows when to stop generating.

@d2l.add_to_class(MTFraEng)
def _tokenize(self, text, max_examples=None):
    """Split preprocessed text into word-level source/target token lists.

    Each line is expected to hold exactly two tab-separated fields
    (source sentence, target sentence). Lines with any other number of
    fields are skipped. Every kept sentence is split on single spaces,
    empty tokens are dropped, and ``'<eos>'`` is appended.

    Args:
        text: Preprocessed corpus, one sentence pair per line.
        max_examples: Stop once this many input lines have been scanned.
            The limit counts lines, including skipped ones. ``None``
            means no limit; ``0`` yields no examples.

    Returns:
        A pair ``(src, tgt)`` of parallel lists of token lists.
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        # Compare with None explicitly so that max_examples=0 means
        # "no examples" instead of silently meaning "no limit".
        if max_examples is not None and i >= max_examples:
            break
        parts = line.split('\t')
        if len(parts) != 2:
            continue
        # Skip empty tokens
        src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
        tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt

Two parallel lists: src[i] and tgt[i] are the i-th English sentence and its French translation:

# Tokenize the whole preprocessed corpus (no max_examples limit).
src, tgt = data._tokenize(text)
# Notebook-style display of the first six source and target token lists.
src[:6], tgt[:6]

Length distribution

Most sentences are short — under 20 tokens. That justifies a small fixed num_steps and a pad/truncate strategy.

def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Draw side-by-side histograms of item lengths for two lists.

    Args:
        legend: Legend labels, passed to ``d2l.plt.legend``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        xlist: First collection; the length of each item is plotted.
        ylist: Second collection; the length of each item is plotted.
    """
    d2l.set_figsize()
    first_lengths = [len(item) for item in xlist]
    second_lengths = [len(item) for item in ylist]
    _, _, containers = d2l.plt.hist([first_lengths, second_lengths])
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    # Hatch the bars of the second series so the two are distinguishable.
    for bar in containers[1].patches:
        bar.set_hatch('/')
    d2l.plt.legend(legend)

Padding to fixed length

Truncate long sequences, pad short ones with <pad>. Track valid_len (real tokens, no padding) — the model needs it to mask attention/loss later. Target sequences get a <bos> prefix; the label is the target shifted by one.

@d2l.add_to_class(MTFraEng)
def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
    """Download the corpus and build the padded source/target arrays.

    Args:
        batch_size: Minibatch size. Not read directly in this method
            (presumably consumed by the data loader -- TODO confirm).
        num_steps: Fixed sequence length used when padding/truncating.
        num_train: Number of leading examples used for training.
        num_val: Number of validation examples; ``num_train + num_val``
            caps how many corpus lines are tokenized.
    """
    # Two-argument super() is required: this function is defined outside
    # the class body and attached by the decorator, so the zero-argument
    # form has no implicit class to bind to.
    super(MTFraEng, self).__init__()
    # NOTE(review): presumably copies the constructor arguments onto self
    # (self.num_steps / self.num_train / self.num_val are read elsewhere
    # in this file without being assigned) -- confirm in d2l. Keep this
    # call before any new local variables are introduced.
    self.save_hyperparameters()
    self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
        self._download())
@d2l.add_to_class(MTFraEng)
def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
    """Turn raw tab-separated text into fixed-length index arrays.

    The text is preprocessed and tokenized (at most
    ``num_train + num_val`` lines), every sentence is padded or truncated
    to ``num_steps`` tokens, and target sentences get a leading
    ``'<bos>'``.

    Args:
        raw_text: Raw corpus text, one tab-separated pair per line.
        src_vocab: Existing source vocabulary, or ``None`` to build one.
        tgt_vocab: Existing target vocabulary, or ``None`` to build one.

    Returns:
        ``((src_array, decoder_input, src_valid_len, label), src_vocab,
        tgt_vocab)`` where ``decoder_input`` is the target array without
        its last column and ``label`` is the target array without its
        first column.
    """
    def _build_array(sentences, vocab, is_tgt=False):
        def fit_length(tokens, size):
            # Too long: keep the first size-1 tokens and close with <eos>.
            if len(tokens) > size:
                return tokens[:size-1] + ['<eos>']
            # Otherwise right-pad with <pad> up to the requested size.
            return tokens + ['<pad>'] * (size - len(tokens))

        fitted = [fit_length(tokens, self.num_steps) for tokens in sentences]
        if is_tgt:
            fitted = [['<bos>'] + tokens for tokens in fitted]
        if vocab is None:
            vocab = d2l.Vocab(fitted, min_freq=2)
        array = d2l.tensor([vocab[tokens] for tokens in fitted])
        # Count the non-<pad> positions along axis 1.
        not_pad = d2l.astype(array != vocab['<pad>'], d2l.int32)
        valid_len = d2l.reduce_sum(not_pad, 1)
        return array, vocab, valid_len

    max_examples = self.num_train + self.num_val
    src, tgt = self._tokenize(self._preprocess(raw_text), max_examples)
    src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
    tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
    arrays = (src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:])
    return arrays, src_vocab, tgt_vocab

Dataloader

Standard split — first num_train examples for training, the rest for validation:

@d2l.add_to_class(MTFraEng)
def get_dataloader(self, train):
    """Return a loader over the training or the validation slice.

    Args:
        train: If truthy, use the first ``num_train`` examples;
            otherwise use everything from index ``num_train`` onward.

    Returns:
        Whatever ``self.get_tensorloader`` returns for ``self.arrays``,
        the ``train`` flag and the selected slice.
    """
    if train:
        rows = slice(0, self.num_train)
    else:
        rows = slice(self.num_train, None)
    return self.get_tensorloader(self.arrays, train, rows)

A minibatch end to end

Source IDs, decoder input (target shifted right with <bos>), valid length, label (target shifted left):

# Rebuild the dataset with a tiny batch size and pull one training batch.
data = MTFraEng(batch_size=3)
# Each batch unpacks as (source, decoder input, source valid length, label).
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))
# Cast to int32 before printing.
print('source:', d2l.astype(src, d2l.int32))
print('decoder input:', d2l.astype(tgt, d2l.int32))
print('source len excluding pad:', d2l.astype(src_valid_len, d2l.int32))
print('label:', d2l.astype(label, d2l.int32))

Encode new sentence pairs with the existing vocabularies, then convert the IDs back to tokens for inspection:

@d2l.add_to_class(MTFraEng)
def build(self, src_sentences, tgt_sentences):
    """Encode sentence pairs with the dataset's existing vocabularies.

    The sentences are paired up in order, joined into the same
    tab-separated, newline-delimited format as the corpus, and passed
    through ``_build_arrays`` together with ``self.src_vocab`` and
    ``self.tgt_vocab``.

    Args:
        src_sentences: Source-language sentences (strings).
        tgt_sentences: Target-language sentences (strings).

    Returns:
        The arrays tuple produced by ``_build_arrays``; the returned
        vocabularies are discarded.
    """
    pair_lines = ['\t'.join((source, target))
                  for source, target in zip(src_sentences, tgt_sentences)]
    arrays, _, _ = self._build_arrays(
        '\n'.join(pair_lines), self.src_vocab, self.tgt_vocab)
    return arrays
# Encode a single sentence pair with the existing vocabularies, keeping
# only the source array and the decoder-input array.
src, tgt, _,  _ = data.build(['hi .'], ['salut .'])
# Map the first (only) row of each array back to tokens for inspection.
print('source:', data.src_vocab.to_tokens(d2l.astype(src[0], d2l.int32)))
print('target:', data.tgt_vocab.to_tokens(d2l.astype(tgt[0], d2l.int32)))

Recap

  • Translation = pairs of unaligned sequences (different lengths, possibly different word orders).
  • Word-level tokenization → a much larger vocabulary than character-level tokenization; treat rare tokens as <unk> to keep it manageable.
  • Pad/truncate to fixed num_steps for batching; record valid_len to mask later.
  • Decoder input is the target with a <bos> prefix; the label is the target shifted left by one — that's the teacher-forcing setup.