from d2l import mxnet as d2l
from mxnet import autograd, gluon, np, npx
from mxnet.gluon import nn
import mxnet as mx
npx.set_np()

Matrix factorization — the recommender baseline that everything else competes against. Treat the rating matrix $\mathbf{R} \in \mathbb{R}^{m \times n}$ (users × items) as a low-rank product:
$$\mathbf{R} \approx \mathbf{P}\mathbf{Q}^\top,\quad \mathbf{P} \in \mathbb{R}^{m \times k},\; \mathbf{Q} \in \mathbb{R}^{n \times k}.$$
User $u$ gets a $k$-dim latent vector $\mathbf{p}_u$; item $i$ gets $\mathbf{q}_i$. Predicted rating is the dot product, plus bias terms:
$$\hat r_{ui} = \mathbf{p}_u^\top \mathbf{q}_i + b_u + b_i.$$
Training minimizes squared error on observed ratings plus regularization:
$$\mathcal{L} = \sum_{(u,i)\in\Omega}(r_{ui}-\hat r_{ui})^2 + \lambda(\|\mathbf{P}\|_F^2+\|\mathbf{Q}\|_F^2 + \|\mathbf{b}^{user}\|_2^2+\|\mathbf{b}^{item}\|_2^2).$$
It rose to fame during the Netflix Prize era (Koren et al., 2009) and is still a strong baseline; deep models add capacity on top.
Two embedding tables + per-user / per-item bias:
class MF(nn.Block):
    """Biased matrix-factorization rating model.

    Keeps a latent-factor embedding table and a one-column bias embedding
    table for users, and the same pair for items. A (user, item) score is
    the dot product of the two factor vectors plus both biases.
    """

    def __init__(self, num_factors, num_users, num_items):
        """Create the factor and bias embedding tables.

        Args:
            num_factors: Width of every latent factor vector.
            num_users: Number of rows in the user tables.
            num_items: Number of rows in the item tables.
        """
        super().__init__()
        # Latent factors: one row per user / per item.
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)
        # Biases are stored as embeddings with a single output column.
        self.user_bias = nn.Embedding(input_dim=num_users, output_dim=1)
        self.item_bias = nn.Embedding(input_dim=num_items, output_dim=1)

    def forward(self, user_id, item_id):
        """Score each (user, item) pair.

        Args:
            user_id: User indices (assumed to be a 1-D batch -- TODO confirm).
            item_id: Item indices aligned element-wise with ``user_id``.

        Returns:
            The flattened array of predicted ratings.
        """
        user_factors = self.P(user_id)
        item_factors = self.Q(item_id)
        # Row-wise dot product of the two factor matrices.
        interaction = (user_factors * item_factors).sum(axis=1)
        # Drop the singleton bias column so it lines up with `interaction`.
        user_offset = np.squeeze(self.user_bias(user_id))
        item_offset = np.squeeze(self.item_bias(item_id))
        scores = interaction + user_offset + item_offset
        return scores.flatten()


# Standard rating-prediction metric:
$$\text{RMSE} = \sqrt{\frac{1}{|\mathcal{T}|} \sum_{(u,i) \in \mathcal{T}} (r_{ui} - \hat r_{ui})^2}.$$
def evaluator(net, test_iter, devices):
    """Return the RMSE of ``net`` accumulated over all of ``test_iter``.

    Args:
        net: Model invoked as ``net(users, items)`` on each device shard.
        test_iter: Iterable yielding ``(users, items, ratings)`` batches.
        devices: Devices every batch is split across.

    Returns:
        The metric value, as a Python float, after every batch has been
        fed to the metric.
    """
    rmse = mx.gluon.metric.RMSE()
    for users, items, ratings in test_iter:
        # even_split=False lets a batch that does not divide evenly by the
        # number of devices still be sharded.
        user_shards = gluon.utils.split_and_load(
            users, devices, even_split=False)
        item_shards = gluon.utils.split_and_load(
            items, devices, even_split=False)
        rating_shards = gluon.utils.split_and_load(
            ratings, devices, even_split=False)
        predictions = [net(u, i) for u, i in zip(user_shards, item_shards)]
        rmse.update(labels=rating_shards, preds=predictions)
    # The metric keeps running totals across update() calls (it is never
    # reset here), so it is read once at the end. Averaging the per-batch
    # running values instead would count early batches many times over.
    return float(rmse.get()[1])


# Adam on MSE loss with \ell_2 weight decay (regularizes the embedding
# magnitudes — important for unobserved (u, i) pairs):
def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                        devices=d2l.try_all_gpus(), evaluator=None,
                        **kwargs):
    """Train a rating model, plotting train loss and test RMSE every epoch.

    Args:
        net: Model; called with the feature shards of each batch.
        train_iter: Iterable of training batches. Each batch is a list whose
            last entry is the label (a non-list batch is used as both
            feature and label).
        test_iter: Iterable handed to ``evaluator`` after each epoch.
        loss: Callable ``loss(pred, label)`` whose result supports
            ``backward()`` and ``asnumpy()``.
        trainer: Optimizer wrapper; ``trainer.step(batch_size)`` is called
            once per batch.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices each batch is split across. The default is
            evaluated once, when the function is defined.
        evaluator: Callable returning the test RMSE. It is called as
            ``evaluator(net, test_iter, devices)``, or with
            ``kwargs['inter_mat']`` inserted before ``devices`` when any
            keyword arguments are supplied.
        **kwargs: Optional extras; only ``inter_mat`` is read.
    """
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    for epoch in range(num_epochs):
        # metric slots: batch-size-weighted loss, examples, elements seen.
        metric, epoch_loss = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            timer.start()
            values = values if isinstance(values, list) else [values]
            input_data = [gluon.utils.split_and_load(v, devices)
                          for v in values]
            train_feat = input_data[:-1] if len(values) > 1 else input_data
            train_label = input_data[-1]
            with autograd.record():
                preds = [net(*t) for t in zip(*train_feat)]
                ls = [loss(p, s) for p, s in zip(preds, train_label)]
            for shard_loss in ls:
                shard_loss.backward()
            batch_size = values[0].shape[0]
            # Loss of this batch only: shard losses are summed, averaged,
            # then divided by the device count.
            batch_loss = (sum([shard_loss.asnumpy() for shard_loss in ls])
                          .mean() / len(devices))
            epoch_loss += batch_loss
            trainer.step(batch_size)
            # Accumulate the per-batch loss (weighted by batch size), not the
            # running epoch total; adding the running total every batch
            # would make metric[0] a sum of prefix sums.
            metric.add(batch_loss * batch_size, batch_size, values[0].size)
            timer.stop()
        if kwargs:  # It will be used in section AutoRec
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  devices)
        else:
            test_rmse = evaluator(net, test_iter, devices)
        train_l = epoch_loss / (i + 1)
        animator.add(epoch + 1, (train_l, test_rmse))
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test RMSE {test_rmse:.3f}')
    # metric only covers the last epoch, so scale by num_epochs to estimate
    # the total number of examples processed during the timed sections.
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')


# Initialize the embedding tables, run rating prediction training, then
# interpret the final RMSE as average prediction error on the 1-5 rating
# scale:
# Pick the devices and load the train/test iterators (10% of the data is
# held out for testing, batches of 512).
devices = d2l.try_all_gpus()
num_users, num_items, train_iter, test_iter = d2l.split_and_load_ml100k(
    test_ratio=0.1, batch_size=512)

# 30 latent factors per user/item, weights drawn from Normal(0.01).
net = MF(num_factors=30, num_users=num_users, num_items=num_items)
net.initialize(init=mx.init.Normal(0.01), ctx=devices, force_reinit=True)

# Optimization hyperparameters.
lr = 0.002
num_epochs = 20
wd = 1e-5
optimizer = 'adam'

# `gluon.loss.L2Loss()` carries a built-in factor of 0.5, which would halve
# the gradient relative to PyTorch's `nn.MSELoss()` at the same `lr`.
# `weight=2` cancels that factor so the objective follows the plain
# mean-squared-error convention.
loss = gluon.loss.L2Loss(weight=2)
trainer = gluon.Trainer(
    net.collect_params(),
    optimizer,
    {"learning_rate": lr, 'wd': wd},
)
train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                    devices=devices, evaluator=evaluator)