def train(net, data_iter, lr, num_epochs, device=d2l.try_gpu()):
    """Train `net` on `data_iter` with Adam and plot the running loss.

    The parameters of `net` are re-initialized on `device`, then for each
    epoch every batch is moved to `device`, scored with `skip_gram` using
    `net[0]` and `net[1]`, and optimized against the masked loss. The running
    mean loss is plotted roughly five times per epoch and a summary line is
    printed once training is done.

    This function relies on names defined outside of it: `skip_gram`, `loss`
    and `batch_size`.

    Args:
        net: Gluon block supporting `initialize`, `collect_params` and
            indexing; `net[0]` and `net[1]` are handed to `skip_gram`.
        data_iter: Sized iterable whose batches unpack into four arrays:
            center, context_negative, mask and label.
        lr: Learning rate passed to the Adam trainer.
        num_epochs: Number of passes over `data_iter`.
        device: Context on which parameters and batches are placed.
    """
    net.initialize(ctx=device, force_reinit=True)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': lr})
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs])
    # Sum of normalized losses, no. of normalized losses
    metric = d2l.Accumulator(2)
    for epoch in range(num_epochs):
        timer, num_batches = d2l.Timer(), len(data_iter)
        # Plot about five points per epoch. The interval is clamped to 1 so
        # that an iterator with fewer than five batches does not trigger a
        # modulo-by-zero below.
        plot_every = max(1, num_batches // 5)
        for i, batch in enumerate(data_iter):
            center, context_negative, mask, label = [
                data.as_in_ctx(device) for data in batch]
            with autograd.record():
                pred = skip_gram(center, context_negative, net[0], net[1])
                # Rescale each row's loss by (row width / sum of its mask);
                # presumably this averages over unmasked positions only --
                # confirm against the definition of `loss`.
                l = (loss(pred.reshape(label.shape), label, mask) *
                     mask.shape[1] / mask.sum(axis=1))
            l.backward()
            # NOTE(review): `batch_size` is not a parameter of this function;
            # it is resolved from the enclosing module scope.
            trainer.step(batch_size)
            metric.add(l.sum(), l.size)
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1],))
    # NOTE(review): metric[1] accumulates over every epoch while `timer` is
    # recreated at the start of each epoch, so the throughput printed here
    # divides an all-epochs count by the timer of the last epoch only.
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} tokens/sec on {str(device)}')