from d2l import tensorflow as d2l
import tensorflow as tf
from tensorflow import keras

With the model (last deck) and the data (deck before that), we can finally pretrain a small BERT end-to-end. This deck does it on a tiny scale: 2 layers, 128 hidden dim, 2 heads. The recipe scales to BERT-Base (12 layers, 768 dim, 12 heads) and BERT-Large by just changing the config.
Each batch supplies tokens, segment IDs, valid lengths, masked positions/labels, MLM weights, and NSP labels:
The notebook uses a deliberately small encoder so the full pretraining loop is runnable in class:
Initialize the optimizer/trainer for this tiny BERT. Scaling to BERT-Base changes only the data size, model width/depth, and compute budget:
Two heads, one combined loss:
$$\mathcal{L} = \mathcal{L}_{\text{MLM}} + \mathcal{L}_{\text{NSP}}.$$
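MLM cross-entropy is averaged over the masked positions (weighted by the MLM weights so padded prediction slots do not count); NSP is a two-class cross-entropy on the <cls> head, averaged over the batch: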
# Both heads are scored with the same loss configuration: inputs are treated
# as logits (from_logits=True) and no reduction is applied, so the caller
# receives one loss value per element and does its own weighting/averaging.
# The objects are created once at module scope because re-instantiating them
# for every batch is wasteful.
_LOSS_CONFIG = {'from_logits': True, 'reduction': 'none'}
_mlm_loss_fn = keras.losses.SparseCategoricalCrossentropy(**_LOSS_CONFIG)
_nsp_loss_fn = keras.losses.SparseCategoricalCrossentropy(**_LOSS_CONFIG)
def _get_batch_loss_bert(net, vocab_size, tokens_X, segments_X,
                         valid_lens_x, pred_positions_X, mlm_weights_X,
                         mlm_Y, nsp_y, training=True):
    """Run one forward pass and compute the two BERT pretraining losses.

    Args:
        net: Model called as ``net(tokens, segments, valid_lens,
            pred_positions, training=...)``; it must return a 3-tuple whose
            second and third items are the MLM and NSP logits.
        vocab_size: Size of the last axis of the MLM logits; used to flatten
            them to ``(-1, vocab_size)``.
        tokens_X: Token ids, passed to ``net`` unchanged.
        segments_X: Segment ids, passed to ``net`` unchanged.
        valid_lens_x: Valid lengths; flattened to 1-D and cast to float32
            before being passed to ``net``.
        pred_positions_X: Masked positions, passed to ``net`` unchanged.
        mlm_weights_X: Per-prediction weights for the MLM loss (presumably
            1 for real masked tokens and 0 for padding -- confirm against
            the data loader).
        mlm_Y: MLM target ids; flattened to 1-D.
        nsp_y: NSP labels; cast to int32.
        training: Forwarded to ``net`` as its ``training`` flag.

    Returns:
        Tuple ``(mlm_l, nsp_l, total_l)`` of scalar tensors where
        ``total_l = mlm_l + nsp_l``.
    """
    # Forward pass
    valid_lens = tf.cast(tf.reshape(valid_lens_x, [-1]), dtype=tf.float32)
    _, mlm_Y_hat, nsp_Y_hat = net(
        tokens_X, segments_X, valid_lens, pred_positions_X,
        training=training)
    # Masked language model loss: per-prediction losses are weighted before
    # summing, then normalized by the total weight. The epsilon keeps the
    # division finite when every weight is zero.
    per_token_l = _mlm_loss_fn(tf.reshape(mlm_Y, [-1]),
                               tf.reshape(mlm_Y_hat, [-1, vocab_size]))
    # Flatten once and match the loss dtype so the numerator and the
    # denominator are computed from the same tensor.
    weights = tf.cast(tf.reshape(mlm_weights_X, [-1]), per_token_l.dtype)
    mlm_l = tf.reduce_sum(per_token_l * weights) / (
        tf.reduce_sum(weights) + 1e-8)
    # Next sentence prediction loss, averaged over the batch
    nsp_l = tf.reduce_mean(_nsp_loss_fn(tf.cast(nsp_y, tf.int32), nsp_Y_hat))
    total_l = mlm_l + nsp_l
    return mlm_l, nsp_l, total_l


# Adam with a fixed learning rate of 1e-4 (no warmup schedule here); on this
# tiny corpus a few hundred steps is enough to see both losses drop. MLM loss
# stays higher than NSP because it predicts a large vocabulary rather than a
# binary label:
def train_bert(train_iter, net, vocab_size, devices, num_steps):
    """Pretrain ``net`` for ``num_steps`` optimizer steps and report losses.

    Args:
        train_iter: Iterable of 7-tuples ``(tokens_X, segments_X,
            valid_lens_x, pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)``.
            It is iterated again from the start whenever it is exhausted
            before ``num_steps`` is reached.
        net: Model to train; its ``trainable_variables`` are updated in place.
        vocab_size: Forwarded to ``_get_batch_loss_bert``.
        devices: Only used in the final throughput message; this function
            does not itself place computation on them.
        num_steps: Number of batches to train on. Must be positive: the final
            report divides by the number of steps actually run.

    Raises:
        ValueError: If a full pass over ``train_iter`` yields no batches
            (empty or already-exhausted iterator), which would otherwise
            loop forever.
    """
    optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss',
                            xlim=[1, num_steps], legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    while step < num_steps:
        step_before_pass = step
        for (tokens_X, segments_X, valid_lens_x, pred_positions_X,
             mlm_weights_X, mlm_Y, nsp_y) in train_iter:
            timer.start()
            with tf.GradientTape() as tape:
                mlm_l, nsp_l, l = _get_batch_loss_bert(
                    net, vocab_size, tokens_X, segments_X, valid_lens_x,
                    pred_positions_X, mlm_weights_X, mlm_Y, nsp_y,
                    training=True)
            grads = tape.gradient(l, net.trainable_variables)
            optimizer.apply_gradients(zip(grads, net.trainable_variables))
            metric.add(float(mlm_l), float(nsp_l), tokens_X.shape[0], 1)
            timer.stop()
            # Plot the running mean of each loss over all steps so far.
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                # The outer loop condition is now false as well.
                break
        if step == step_before_pass:
            # No batch was produced in this pass, so another pass cannot
            # make progress either.
            raise ValueError(
                'train_iter yielded no batches; cannot reach num_steps')

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{devices!s}')


# After pretraining, the encoder is the useful part -- it turns token
# sequences into contextual representations. The pretraining heads can be
# discarded for most downstream tasks:
def get_bert_encoding(net, tokens_a, tokens_b=None):
    """Encode one sentence (or a sentence pair) with a pretrained BERT.

    Relies on a module-level ``vocab`` (not defined in this part of the
    file) that maps a list of tokens to a list of ids via indexing.

    Args:
        net: Model called as ``net(token_ids, segments, valid_len,
            training=False)``; the first item of its 3-tuple result is
            returned.
        tokens_a: Tokens of the first sentence.
        tokens_b: Optional tokens of the second sentence.

    Returns:
        The first output of ``net`` for a batch containing this single
        sequence.
    """
    tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
    # expand_dims adds a leading batch axis of size 1.
    token_ids = tf.expand_dims(
        tf.constant(vocab[tokens], dtype=tf.int32), axis=0)
    segments = tf.expand_dims(
        tf.constant(segments, dtype=tf.int32), axis=0)
    valid_len = tf.constant([len(tokens)], dtype=tf.float32)
    # The MLM and NSP head outputs are not needed for feature extraction.
    encoded_X, _, _ = net(token_ids, segments, valid_len, training=False)
    return encoded_X


# “a crane is flying” → 6 hidden vectors (one per token, including <cls> and
# <sep>). Each is contextual — the representation of “crane” depends on its
# neighbors:
tokens_a = ['a', 'crane', 'is', 'flying']
encoded_text = get_bert_encoding(net, tokens_a)
# Tokens: '<cls>', 'a', 'crane', 'is', 'flying', '<sep>'
encoded_text_cls = encoded_text[:, 0, :]  # position 0 is '<cls>'
encoded_text_crane = encoded_text[:, 2, :]  # position 2 is 'crane'
encoded_text.shape, encoded_text_cls.shape, encoded_text_crane[0][:3]
# Recorded output (the numeric values depend on the trained weights):
# (TensorShape([1, 6, 128]),
#  TensorShape([1, 128]),
#  <tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.44974235, 1.4124272 , -1.1196436 ], dtype=float32)>)
“a crane driver came” / “he just left”. Same encoder, two-segment input — segment IDs distinguish the two halves inside the same sequence:
tokens_a, tokens_b = ['a', 'crane', 'driver', 'came'], ['he', 'just', 'left']
encoded_pair = get_bert_encoding(net, tokens_a, tokens_b)
# Tokens: '<cls>', 'a', 'crane', 'driver', 'came', '<sep>', 'he', 'just',
# 'left', '<sep>'
encoded_pair_cls = encoded_pair[:, 0, :]  # position 0 is '<cls>'
encoded_pair_crane = encoded_pair[:, 2, :]  # position 2 is 'crane'
encoded_pair.shape, encoded_pair_cls.shape, encoded_pair_crane[0][:3]
# Recorded output (the numeric values depend on the trained weights):
# (TensorShape([1, 10, 128]),
#  TensorShape([1, 128]),
#  <tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.75254744, 0.96034265, -1.0952494 ], dtype=float32)>)