Natural Language Inference: Using Attention

Decomposable Attention

Decomposable Attention (Parikh et al., 2016) — a small, fast NLI model that beat much more complex recurrence-based architectures on SNLI in 2016. No recurrence, no convolution — pure attention + MLPs.

Three steps: Attend → Compare → Aggregate.

Pipeline

GloVe → attend → compare → aggregate → 3-way classifier.

The decomposable attention model

Align premise/hypothesis tokens, then compare and aggregate.

Setup

from d2l import jax as d2l
import jax
from jax import numpy as jnp
from flax import nnx
import optax
import numpy as np

Step 1: Attend

Compute alignment weights between every premise word and every hypothesis word. Use them to build aligned context vectors:

class MLP(nnx.Module):
    """Two stacked stages of dropout -> linear -> ReLU.

    Args:
        num_inputs: Input feature size of the first linear layer.
        num_hiddens: Output size of both linear layers.
        flatten: If true, the activations after each ReLU are reshaped to
            (`x.shape[0]`, -1).
        rngs: `nnx.Rngs` used to construct the layers. When `None`, a new
            `nnx.Rngs(params=0, dropout=1)` is created.
    """

    def __init__(self, num_inputs, num_hiddens, flatten, rngs=None):
        if rngs is None:
            rngs = nnx.Rngs(params=0, dropout=1)
        # Layers are created in the order they are applied.
        self.dropout1 = nnx.Dropout(0.2, rngs=rngs)
        self.dense1 = nnx.Linear(num_inputs, num_hiddens, rngs=rngs)
        self.dropout2 = nnx.Dropout(0.2, rngs=rngs)
        self.dense2 = nnx.Linear(num_hiddens, num_hiddens, rngs=rngs)
        self.flatten = flatten

    def __call__(self, x):
        stages = ((self.dropout1, self.dense1), (self.dropout2, self.dense2))
        for dropout, dense in stages:
            x = nnx.relu(dense(dropout(x)))
            if self.flatten:
                # Collapse everything after the leading axis
                x = x.reshape((x.shape[0], -1))
        return x

class Attend(nnx.Module):
    """Softly align the tokens of two sequences with each other.

    Both sequences are passed through the same MLP `f`; the pairwise dot
    products of the results are used as attention scores.
    """

    def __init__(self, embed_size, num_hiddens, rngs=None):
        self.f = MLP(embed_size, num_hiddens, flatten=False, rngs=rngs)

    def __call__(self, A, B):
        """Return `(beta, alpha)` for sequences `A` and `B`.

        Args:
            A: Shape (`batch_size`, no. of tokens in sequence A,
                `embed_size`).
            B: Shape (`batch_size`, no. of tokens in sequence B,
                `embed_size`).

        Returns:
            beta: Shape (`batch_size`, no. of tokens in sequence A,
                `embed_size`); sequence B softly aligned with each token
                (axis 1) of sequence A.
            alpha: Shape (`batch_size`, no. of tokens in sequence B,
                `embed_size`); sequence A softly aligned with each token
                (axis 1) of sequence B.
        """
        # Shape of `hidden_A`/`hidden_B`: (`batch_size`, no. of tokens in
        # sequence A/B, `num_hiddens`)
        hidden_A, hidden_B = self.f(A), self.f(B)
        # Shape of `scores`: (`batch_size`, no. of tokens in sequence A,
        # no. of tokens in sequence B)
        scores = hidden_A @ jnp.transpose(hidden_B, (0, 2, 1))
        # Normalize over the tokens of B (rows) and of A (transposed rows)
        weights_over_B = jax.nn.softmax(scores, axis=-1)
        weights_over_A = jax.nn.softmax(
            jnp.transpose(scores, (0, 2, 1)), axis=-1)
        beta = weights_over_B @ B
        alpha = weights_over_A @ A
        return beta, alpha

Step 2: Compare

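For each premise word a_i, run an MLP on the concatenation [a_i, \beta_i], where \beta_i is the soft-aligned hypothesis context. The same MLP is applied to each hypothesis word b_j paired with its soft-aligned premise context \alpha_j, i.e. on [b_j, \alpha_j]: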

class Compare(nnx.Module):
    """Compare each token with its softly aligned counterpart.

    A single MLP `g` is shared by both sequences; its input is a token
    embedding concatenated (along axis 2) with the aligned vector from the
    other sequence.
    """

    def __init__(self, embed_size, num_hiddens, rngs=None):
        self.g = MLP(2 * embed_size, num_hiddens, flatten=False, rngs=rngs)

    def __call__(self, A, B, beta, alpha):
        """Return the comparison vectors `(V_A, V_B)`."""
        paired_A = jnp.concatenate((A, beta), axis=2)
        paired_B = jnp.concatenate((B, alpha), axis=2)
        # `g` is applied to sequence A first, then to sequence B
        return self.g(paired_A), self.g(paired_B)

Step 3: Aggregate

Sum the per-token compared vectors → concat the two sentence summaries → final MLP → 3-way logits:

class Aggregate(nnx.Module):
    """Pool both sets of comparison vectors and map them to output logits.

    Args:
        num_hiddens: Width of the comparison vectors and of the MLP `h`.
        num_outputs: Number of logits produced by the final linear layer.
        rngs: `nnx.Rngs` used to construct the layers. When `None`, a new
            `nnx.Rngs(params=0, dropout=1)` is created.
    """

    def __init__(self, num_hiddens, num_outputs, rngs=None):
        if rngs is None:
            rngs = nnx.Rngs(params=0, dropout=1)
        self.h = MLP(2 * num_hiddens, num_hiddens, flatten=True, rngs=rngs)
        self.output = nnx.Linear(num_hiddens, num_outputs, rngs=rngs)

    def __call__(self, V_A, V_B):
        # Sum each set of comparison vectors over axis 1
        pooled = [jnp.sum(V, axis=1) for V in (V_A, V_B)]
        # The two summaries are concatenated and fed into an MLP, followed by
        # the output layer
        hidden = self.h(jnp.concatenate(pooled, axis=1))
        return self.output(hidden)

Putting it together

The final module wires the three stages into one classifier. Inputs are premise IDs and hypothesis IDs; output is 3 logits for entailment, contradiction, and neutral.

class DecomposableAttention(nnx.Module):
    """Embedding followed by the attend, compare and aggregate stages.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Embedding dimension.
        num_hiddens: Hidden width used by all three stages.
        rngs: `nnx.Rngs` shared by every submodule. When `None`, a new
            `nnx.Rngs(params=0, dropout=1)` is created.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, rngs=None):
        if rngs is None:
            rngs = nnx.Rngs(params=0, dropout=1)
        self.embedding = nnx.Embed(vocab_size, embed_size, rngs=rngs)
        self.attend = Attend(embed_size, num_hiddens, rngs=rngs)
        self.compare = Compare(embed_size, num_hiddens, rngs=rngs)
        # There are 3 possible outputs: entailment, contradiction, and neutral
        self.aggregate = Aggregate(num_hiddens, 3, rngs=rngs)

    def __call__(self, premises, hypotheses):
        """Return the 3 logits for each premise/hypothesis pair of token IDs."""
        A, B = self.embedding(premises), self.embedding(hypotheses)
        beta, alpha = self.attend(A, B)
        return self.aggregate(*self.compare(A, B, beta, alpha))

Loading data + model

SNLI examples are padded premise/hypothesis pairs. Initialize the model with GloVe embeddings, then train all MLP stages end-to-end:

# Minibatch size and the `num_steps` argument forwarded to the SNLI loader
# (presumably the fixed sequence length examples are padded to -- confirm
# against `d2l.load_data_snli`).
batch_size = 512
num_steps = 32
train_iter, test_iter, vocab = d2l.load_data_snli(batch_size, num_steps)

read 549367 examples
read 9824 examples

# Model hyperparameters; `embed_size` equals the 100 in the name of the
# pretrained GloVe embedding loaded below.
embed_size = 100
num_hiddens = 200
devices = d2l.try_all_gpus()
net = DecomposableAttention(len(vocab), embed_size, num_hiddens)
# Look up a pretrained vector for every token of the vocabulary, in index
# order.
glove_embedding = d2l.TokenEmbedding('glove.6b.100d')
embeds = glove_embedding[vocab.idx_to_token]

Training

Training should be fast: there is no recurrence, so every token-pair alignment and every MLP comparison is fully parallelizable.

lr = 0.001
num_epochs = 4
# Overwrite the embedding table in place with the pretrained vectors.
net.embedding.embedding[...] = jnp.array(embeds)
optimizer = nnx.Optimizer(net, optax.adam(lr), wrt=nnx.Param)
# Two views of `net`, with `deterministic` set to False and True
# (intended to enable dropout for training and disable it for evaluation).
train_net = nnx.view(net, deterministic=False)
eval_net = nnx.view(net, deterministic=True)

@nnx.jit
def train_step(net, optimizer, premises, hypotheses, labels):
    """Apply one optimizer update on a minibatch and return its mean loss.

    The loss is the softmax cross-entropy between the logits of
    `net(premises, hypotheses)` and the integer `labels`, averaged over the
    minibatch.
    """
    def mean_cross_entropy(model):
        per_example = optax.softmax_cross_entropy_with_integer_labels(
            model(premises, hypotheses), labels)
        return per_example.mean()

    grad_fn = nnx.value_and_grad(mean_cross_entropy)
    loss, grads = grad_fn(net)
    optimizer.update(net, grads)
    return loss

@nnx.jit
def eval_step(net, premises, hypotheses, labels):
    """Return the number of examples in the minibatch predicted correctly."""
    predictions = jnp.argmax(net(premises, hypotheses), axis=-1)
    return jnp.sum(predictions == labels)

# Train for `num_epochs` epochs. After each epoch, report the mean training
# loss over its minibatches and the accuracy on the test set; at the end,
# print the overall throughput.
timer = d2l.Timer()
for epoch in range(num_epochs):
    # Per-minibatch losses are stored as returned by `train_step` and only
    # reduced after the inner loop; `n_train` counts this epoch's examples.
    batch_losses, n_train = [], 0
    for batch in train_iter:
        premises, hypotheses, labels = batch[0], batch[1], batch[2]
        batch_losses.append(train_step(
            train_net, optimizer, premises, hypotheses, labels))
        n_train += len(labels)
    # Synchronize once per epoch, rather than once per minibatch.
    train_loss = float(jnp.stack(batch_losses).mean())
    # Evaluate on test set
    # Per-minibatch correct counts are likewise accumulated and summed once.
    batch_correct, n_test = [], 0
    for batch in test_iter:
        premises, hypotheses, labels = batch[0], batch[1], batch[2]
        batch_correct.append(eval_step(
            eval_net, premises, hypotheses, labels))
        n_test += len(labels)
    n_correct = int(jnp.stack(batch_correct).sum())
    print(f'epoch {epoch + 1}, loss {train_loss:.4f}, '
          f'test acc {n_correct / n_test:.4f}')
# `n_train` holds the example count of the last epoch, so the throughput
# assumes every epoch processes the same number of examples. The timer is
# stopped only here, after the per-epoch test evaluations
# (NOTE(review): assumes `d2l.Timer` starts timing on construction -- confirm).
print(f'{num_epochs * n_train / timer.stop():.1f} examples/sec')

epoch 1, loss 0.8543, test acc 0.7414
epoch 2, loss 0.6290, test acc 0.7910
epoch 3, loss 0.5628, test acc 0.8029
epoch 4, loss 0.5267, test acc 0.8184
937.5 examples/sec

Predict

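Read the examples semantically: “he is good” follows from “he is great” (entailment), while “he is bad” contradicts “he is good” (contradiction); the latter pair is the one tested below. `predict_snli` maps label indices 0, 1, and 2 to entailment, contradiction, and neutral, respectively.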

def predict_snli(net, vocab, premise, hypothesis):
    """Predict the logical relationship between the premise and hypothesis.

    Args:
        net: Model called as `net(premise_ids, hypothesis_ids)`; it is
            evaluated through a view with `deterministic=True`.
        vocab: Mapping indexed with a list of tokens to obtain their IDs.
        premise: List of premise tokens.
        hypothesis: List of hypothesis tokens.

    Returns:
        'entailment' for label 0, 'contradiction' for label 1, and 'neutral'
        otherwise.
    """
    # Token IDs as a minibatch with a single example
    premise_ids = jnp.array(vocab[premise]).reshape((1, -1))
    hypothesis_ids = jnp.array(vocab[hypothesis]).reshape((1, -1))
    inference_net = nnx.view(net, deterministic=True)
    logits = inference_net(premise_ids, hypothesis_ids)
    # Pull the single predicted class index out as a Python scalar
    label = jnp.argmax(logits, axis=1).item()
    return {0: 'entailment', 1: 'contradiction'}.get(label, 'neutral')

# Example: premise "he is good ." against hypothesis "he is bad .".
predict_snli(net, vocab, ['he', 'is', 'good', '.'],
             ['he', 'is', 'bad', '.'])

'contradiction'

Recap

Decomposable Attention does NLI in three small MLP stages: attend, compare, aggregate.
No recurrence — the model is completely parallelizable, so it trained fast even before GPU acceleration was abundant.
A precursor to the cross-attention machinery that BERT (next deck) does end-to-end inside one Transformer encoder.