Reading IMDb

Sentiment Analysis and the Dataset

Sentiment Analysis Data

Sentiment analysis — predict positive/negative from a movie review. Classic text-classification benchmark; the hello-world for applying deep learning to NLP.

This chapter uses IMDb: 25k positive + 25k negative movie reviews, balanced. The next two decks build an RNN and a CNN classifier on top of this pipeline; the deck after fine-tunes BERT on it.

This deck is just the data plumbing.

from d2l import jax as d2l
import jax
from jax import numpy as jnp
from flax import linen as nn
import optax
import numpy as np
import os

Each review is a separate text file. Walk the directory, one folder per label:

# Register the IMDb archive under the key 'aclImdb' in d2l's DATA_HUB table.
# The entry is a (URL, hex string) pair; the hex string is presumably the
# digest d2l uses to verify the download -- confirm against d2l.
d2l.DATA_HUB['aclImdb'] = (d2l.DATA_URL + 'aclImdb_v1.tar.gz', 
                          '01ada507287d82875905620988597833ad4e0903')

# Fetch and unpack the archive. data_dir is the directory that read_imdb
# later joins 'train'/'test' and 'pos'/'neg' onto.
data_dir = d2l.download_extract('aclImdb', 'aclImdb')
def read_imdb(data_dir, is_train):
    """Load the raw IMDb reviews and their sentiment labels from disk.

    Args:
        data_dir: Root of the extracted dataset; it must contain
            ``train/`` and ``test/`` folders, each with ``pos/`` and
            ``neg/`` subfolders holding one review per file.
        is_train: Truthy to read the ``train`` split, falsy for ``test``.

    Returns:
        A ``(data, labels)`` pair of equal-length lists. ``data`` holds the
        review texts (UTF-8 decoded, newline characters removed) and
        ``labels`` holds ``1`` for a positive review and ``0`` for a
        negative one. All positive reviews come before all negative ones;
        within a folder the order is whatever ``os.listdir`` yields.
    """
    split = 'train' if is_train else 'test'
    data, labels = [], []
    for sentiment, target in (('pos', 1), ('neg', 0)):
        folder = os.path.join(data_dir, split, sentiment)
        for name in os.listdir(folder):
            # Read raw bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding.
            with open(os.path.join(folder, name), 'rb') as handle:
                raw = handle.read()
            data.append(raw.decode('utf-8').replace('\n', ''))
            labels.append(target)
    return data, labels

# read_imdb returns a (reviews, labels) pair, so train_data[0] holds the
# review strings and train_data[1] the matching 0/1 labels.
train_data = read_imdb(data_dir, is_train=True)
print('# training examples:', len(train_data[0]))
# Preview the first three examples: the label and the first 60 characters.
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[:60])
# training examples: 25000
label: 1 review: Zentropa has much in common with The Third Man, another noir
label: 1 review: Zentropa is the most original movie I've seen in years. If y
label: 1 review: Lars Von Trier is never backward in trying out new technique

Preprocessing

Tokenize each review into words, build a vocabulary with a frequency cutoff, then truncate or pad each review to a fixed length:

# Split every training review into word-level tokens.
train_tokens = d2l.tokenize(train_data[0], token='word')
# Build the vocabulary with a frequency cutoff of 5 and reserve '<pad>',
# which is used below as the filler when reviews are padded.
vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
# Plot the distribution of review lengths (in tokens), with bin edges every
# 50 tokens from 0 up to 950.
d2l.set_figsize()
d2l.plt.xlabel('# tokens per review')
d2l.plt.ylabel('count')
# The trailing ';' presumably suppresses the notebook's echo of hist()'s
# return value -- it has no effect on the plot itself.
d2l.plt.hist([len(line) for line in train_tokens], bins=range(0, 1000, 50));

num_steps = 500  # sequence length
# Map each tokenized review to vocabulary indices, then truncate or pad it
# (with the '<pad>' index) to exactly num_steps entries so all rows stack
# into one 2-D array.
train_features = d2l.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)
(25000, 500)

Data loaders

# Pair the padded features with their labels and batch them 64 at a time.
train_iter = d2l.load_array((train_features, jnp.array(train_data[1])), 64)

# Peek at the first minibatch only, to check the shapes.
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break
print('# batches:', len(train_iter))
X: (64, 500) , y: (64,)
# batches: 390
def load_data_imdb(batch_size, num_steps=500):
    """Return data iterators and the vocabulary of the IMDb review dataset.

    Downloads (if needed) and reads both splits, tokenizes them into words,
    builds the vocabulary from the training split only, and converts every
    review to a fixed-length row of token indices.

    Args:
        batch_size: Minibatch size handed to ``d2l.load_array``.
        num_steps: Fixed sequence length; each review is truncated or
            padded to exactly this many token indices.

    Returns:
        A ``(train_iter, test_iter, vocab)`` tuple. The iterators yield
        ``(features, labels)`` minibatches; ``test_iter`` is created with
        ``is_train=False``.
    """
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    # Reserve '<pad>' so it gets its own index, matching the standalone
    # preprocessing step earlier in this file. Without the reservation
    # '<pad>' is not a vocabulary entry, and looking it up would presumably
    # resolve to the unknown-token index, making padding indistinguishable
    # from out-of-vocabulary words.
    vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
    pad_id = vocab['<pad>']

    def _to_features(token_lists):
        # Index each review and force it to exactly num_steps entries.
        return jnp.array([d2l.truncate_pad(vocab[line], num_steps, pad_id)
                          for line in token_lists])

    train_iter = d2l.load_array(
        (_to_features(train_tokens), jnp.array(train_data[1])), batch_size)
    test_iter = d2l.load_array(
        (_to_features(test_tokens), jnp.array(test_data[1])), batch_size,
        is_train=False)
    return train_iter, test_iter, vocab

Recap

  • IMDb: 25k+25k movie reviews, binary sentiment.
  • Standard preprocessing: tokenize, build vocab, truncate/pad, batch.
  • Output: (token_ids, label) minibatches, ready for the RNN, CNN, and BERT classifiers in the next decks.