from d2l import torch as d2l
import torch
from torch import nn

Matrix factorization — the recommender baseline that everything else competes against. Treat the rating matrix \mathbf{R} \in \mathbb{R}^{m \times n} (users × items) as a low-rank product:
\mathbf{R} \approx \mathbf{P}\mathbf{Q}^\top,\quad \mathbf{P} \in \mathbb{R}^{m \times k},\; \mathbf{Q} \in \mathbb{R}^{n \times k}.
User u gets a k-dim latent vector \mathbf{p}_u; item i gets \mathbf{q}_i. Predicted rating is the dot product, plus bias terms:
\hat r_{ui} = \mathbf{p}_u^\top \mathbf{q}_i + b_u + b_i.
Training minimizes squared error on observed ratings plus regularization:
\mathcal{L} = \sum_{(u,i)\in\Omega}(r_{ui}-\hat r_{ui})^2 + \lambda(\|\mathbf{P}\|_F^2+\|\mathbf{Q}\|_F^2 + \|\mathbf{b}^{user}\|_2^2+\|\mathbf{b}^{item}\|_2^2).
Popularized during the Netflix Prize era (Koren et al., 2009), it is still a strong baseline; deep models add capacity on top.
Two embedding tables + per-user / per-item bias:
class MF(nn.Module):
    """Biased matrix-factorization rating predictor.

    Every user and every item owns a ``num_factors``-dimensional embedding
    and a scalar bias. The predicted rating is the dot product of the user
    and item embeddings plus both biases.

    Args:
        num_factors: Width of the user and item embedding vectors.
        num_users: Number of rows in the user embedding and bias tables.
        num_items: Number of rows in the item embedding and bias tables.
    """

    def __init__(self, num_factors, num_users, num_items):
        super().__init__()
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        # Factors start as small Gaussian noise and biases start at zero.
        nn.init.normal_(self.P.weight, std=0.01)
        nn.init.normal_(self.Q.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_id: torch.Tensor,
                item_id: torch.Tensor) -> torch.Tensor:
        """Predict one rating per (user, item) index pair.

        Args:
            user_id: Integer index tensor into the user tables.
            item_id: Integer index tensor into the item tables; must have a
                shape compatible with ``user_id``.

        Returns:
            A 1-D tensor holding one prediction per index pair.
        """
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)
        # Reduce over the trailing factor axis and drop only the trailing
        # size-1 bias axis. Hard-coding `dim=1` and a bare `squeeze()` only
        # works for 1-D id batches: a scalar id has no dim 1, and a bare
        # squeeze also removes any other size-1 axis.
        outputs = (P_u * Q_i).sum(dim=-1) + b_u.squeeze(-1) + b_i.squeeze(-1)
        return outputs.flatten()


# Standard rating-prediction metric:
\text{RMSE} = \sqrt{\frac{1}{|\mathcal{T}|} \sum_{(u,i) \in \mathcal{T}} (r_{ui} - \hat r_{ui})^2}.
def evaluator(net, test_iter, devices):
    """Return the root-mean-square error of ``net`` over ``test_iter``.

    Args:
        net: Model called as ``net(users, items)``. It is switched to eval
            mode and left in that mode.
        test_iter: Iterable yielding ``(users, items, ratings)`` tensors.
        devices: Sequence of devices; only ``devices[0]`` is used.

    Returns:
        The RMSE over every rating seen, as a Python float. NaN is returned
        when ``test_iter`` yields no ratings.
    """
    net.eval()
    device = devices[0]
    sq_err_total, n = 0.0, 0
    with torch.no_grad():
        for users, items, ratings in test_iter:
            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)
            preds = net(users, items)
            # Line ratings up with the predictions element by element.
            # Without this, (B, 1) ratings broadcast against (B,) predictions
            # into a (B, B) matrix and silently inflate the error.
            ratings = ratings.reshape(preds.shape)
            sq_err_total += ((preds - ratings) ** 2).sum().item()
            n += ratings.numel()
    if n == 0:
        # The mean of zero squared errors is undefined.
        return float("nan")
    return (sq_err_total / n) ** 0.5


# Adam on MSE loss with \ell_2 weight decay (regularizes the embedding
# magnitudes — important for unobserved (u, i) pairs):
def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                        devices=None, evaluator=None,
                        **kwargs):
    """Train a rating model and report train loss and test RMSE per epoch.

    Each epoch runs one pass over ``train_iter`` (zero grad, forward, loss,
    backward, optimizer step), calls ``evaluator`` on ``test_iter``, and adds
    the (train loss, test RMSE) pair to a ``d2l.Animator``. After the last
    epoch a loss/RMSE line and a throughput line are printed.

    Args:
        net: Model called as ``net(users, items)``; moved to ``devices[0]``.
        train_iter: Iterable yielding ``(users, items, ratings)`` tensors.
        test_iter: Evaluation data, passed through to ``evaluator``.
        loss: Callable ``loss(preds, ratings)`` returning a scalar tensor.
        trainer: Optimizer exposing ``zero_grad()`` and ``step()``.
        num_epochs: Number of passes over ``train_iter``.
        devices: Sequence of devices; only ``devices[0]`` is used. Defaults
            to ``d2l.try_all_gpus()``, resolved when the function is called.
        evaluator: Required callable returning the test RMSE. It is called
            as ``evaluator(net, test_iter, devices)``, or as
            ``evaluator(net, test_iter, kwargs['inter_mat'], devices)`` when
            extra keyword arguments are given.
        **kwargs: Optional extras; ``inter_mat`` is forwarded to
            ``evaluator`` (used in section AutoRec).

    Raises:
        TypeError: If ``evaluator`` is not provided.
    """
    if evaluator is None:
        # Fail before training. Otherwise the None would only be called, and
        # raise, after a whole epoch of work.
        raise TypeError(
            'train_recsys_rating() requires an `evaluator` callable')
    if devices is None:
        # Resolved per call. A `d2l.try_all_gpus()` default in the signature
        # would be evaluated once at definition time and shared by all calls.
        devices = d2l.try_all_gpus()
    net = net.to(devices[0])
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    metric, test_rmse = None, float('nan')
    for epoch in range(num_epochs):
        net.train()
        # Slots: [sum of loss * batch size, example count, example count].
        metric = d2l.Accumulator(3)
        batch_loss_sum, num_batches = 0., 0
        for values in train_iter:
            timer.start()
            users, items, ratings = [v.to(devices[0]) for v in values]
            trainer.zero_grad()
            preds = net(users, items)
            ls = loss(preds, ratings)
            ls.backward()
            trainer.step()
            # Read the loss back once; each .item() is a separate transfer.
            batch_loss = ls.item()
            batch_size = users.shape[0]
            batch_loss_sum += batch_loss
            num_batches += 1
            metric.add(batch_loss * batch_size, batch_size, batch_size)
            timer.stop()
        if kwargs:  # It will be used in section AutoRec
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  devices)
        else:
            test_rmse = evaluator(net, test_iter, devices)
        # Mean of the per-batch losses; undefined when no batch was seen.
        train_l = batch_loss_sum / num_batches if num_batches else float('nan')
        animator.add(epoch + 1, (train_l, test_rmse))
    if metric is None or metric[1] == 0:
        # No epoch ran or the last epoch saw no examples, so the summary
        # ratios below would be undefined.
        return
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test RMSE {test_rmse:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')


# Initialize the embedding tables, run rating prediction training, then
# interpret the final RMSE as average prediction error on the 1-5 rating scale:
# Load the MovieLens-100k split, build the MF model with 30 latent factors,
# and train it for 20 epochs with Adam and MSE loss.
devices = d2l.try_all_gpus()
num_users, num_items, train_iter, test_iter = d2l.split_and_load_ml100k(
    test_ratio=0.1, batch_size=512)
loss = nn.MSELoss()
net = MF(num_factors=30, num_users=num_users, num_items=num_items)
trainer = torch.optim.Adam(
    params=net.parameters(),
    lr=0.002,
    weight_decay=1e-5,
)
train_recsys_rating(
    net=net,
    train_iter=train_iter,
    test_iter=test_iter,
    loss=loss,
    trainer=trainer,
    num_epochs=20,
    devices=devices,
    evaluator=evaluator,
)

# Recorded output of the run above:
#   train loss 0.644, test RMSE 1.043
#   382587.0 examples/sec on [device(type='cuda', index=0)]