from d2l import jax as d2l
import jax
from jax import numpy as jnp
from flax import linen as nn
import optax
import numpy as np
import os
Sentiment analysis — predict positive/negative from a movie review. Classic text-classification benchmark; the hello-world for applying deep learning to NLP.
This chapter uses IMDb: 25k positive + 25k negative movie reviews, balanced. The next two decks build an RNN and a CNN classifier on top of this pipeline; the deck after fine-tunes BERT on it.
This deck is just the data plumbing.
Each review is a separate text file. Walk the directory, one folder per label:
def read_imdb(data_dir, is_train):
    """Read the IMDb review dataset text sequences and labels.

    Args:
        data_dir: Root directory of the extracted dataset. It must contain
            ``train/`` and ``test/`` folders, each with ``pos/`` and
            ``neg/`` subfolders holding one review per file.
        is_train: If true, read the ``train`` split; otherwise ``test``.

    Returns:
        A ``(data, labels)`` pair of equal-length lists: the review strings
        (UTF-8 decoded, newlines removed) and their integer labels
        (1 for ``pos``, 0 for ``neg``).
    """
    split = 'train' if is_train else 'test'
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, split, label)
        # os.listdir returns entries in arbitrary order; sort them so the
        # example order is reproducible across runs and machines.
        for file in sorted(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
            data.append(review)
            labels.append(1 if label == 'pos' else 0)
    return data, labels
# Locate (downloading and extracting if necessary) the dataset root; this is
# the same call load_data_imdb uses to obtain its data directory.
data_dir = d2l.download_extract('aclImdb', 'aclImdb')
train_data = read_imdb(data_dir, is_train=True)
print('# training examples:', len(train_data[0]))
# Show the label and the first 60 characters of the first three reviews.
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[:60])
# training examples: 25000
label: 1 review: Zentropa has much in common with The Third Man, another noir
label: 1 review: Zentropa is the most original movie I've seen in years. If y
label: 1 review: Lars Von Trier is never backward in trying out new technique
Tokenize words, build a vocabulary with frequency cutoff, truncate or pad each review to a fixed length:
X: (64, 500) , y: (64,)
# batches: 390
def load_data_imdb(batch_size, num_steps=500):
    """Return data iterators and the vocabulary of the IMDb review dataset.

    Args:
        batch_size: Minibatch size passed to ``d2l.load_array``.
        num_steps: Fixed number of tokens per review; every review is
            truncated or padded to this length with ``d2l.truncate_pad``.

    Returns:
        A ``(train_iter, test_iter, vocab)`` tuple: the two iterators built
        by ``d2l.load_array`` over (token-id features, labels), and the
        vocabulary built from the training tokens.
    """
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    # The vocabulary is built from the training split only. '<pad>' is
    # reserved explicitly so padding gets its own index.
    # NOTE(review): assumes d2l.Vocab maps tokens it does not contain to its
    # unknown-token index, which is what padding would otherwise share --
    # confirm against the d2l.Vocab implementation in use.
    vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
    pad_id = vocab['<pad>']

    def _encode(token_lists):
        # Map each tokenized review to ids and force it to length num_steps.
        return jnp.array([
            d2l.truncate_pad(vocab[line], num_steps, pad_id)
            for line in token_lists])

    train_iter = d2l.load_array(
        (_encode(train_tokens), jnp.array(train_data[1])), batch_size)
    test_iter = d2l.load_array(
        (_encode(test_tokens), jnp.array(test_data[1])), batch_size,
        is_train=False)
    return train_iter, test_iter, vocab


# (token_ids, label) minibatches, ready for the RNN, CNN, and BERT
# classifiers in the next decks.