import random

import torch
from d2l import torch as d2l
from torch import nn

# Matrix factorization treats user history as a bag — order doesn't matter.
# But sessions reveal short-term intent that bags miss: someone who just
# watched two sci-fi movies probably wants a third, regardless of their
# all-time average preferences.
# Caser (Tang & Wang, 2018) is a convolutional sequence recommender. It stacks
# a user's most recent interactions into an L × d matrix (last L items ×
# embedding dimension), applies horizontal convolutions (to capture sequential
# patterns) and vertical convolutions (to capture point-wise patterns), and
# combines the result with a per-user latent vector to predict the next item.
# Caser is a bridge between session-based RNN models and collaborative
# filtering: it combines a "what you've been doing recently" signal with a
# "who you are" signal.
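# Training uses the same pairwise ranking (BPR) objective as NeuMF:
# $$\mathcal{L}_{BPR} = -\sum_{(u,i,j)} \log \sigma(\hat{y}_{uit} - \hat{y}_{ujt}),$$
# where $i$ is the observed next item and $j$ is a sampled negative item for
# user $u$ at step $t$.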
# Two parallel CNN branches run over the recent-items matrix: horizontal
# filters scan multi-item sequences, and vertical filters mix item embeddings.
# Embedding tables + parallel convolution branches + a per-user MF component
# produce the final score:
class Caser(nn.Module):
    """Convolutional sequence embedding recommender (Caser).

    The last ``L`` items of a user are embedded into a ``(L, num_factors)``
    matrix. A vertical convolution (kernel ``(L, 1)``) and a bank of
    horizontal convolutions (kernel heights ``1..L``, each followed by
    max-pooling over the remaining positions) extract features from that
    matrix. The features go through dropout and a fully connected layer, are
    concatenated with the user embedding, and are scored against a per-item
    embedding plus a per-item bias.

    Args:
        num_factors: Size of the user/item embeddings.
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding tables.
        L: Length of the item history fed to ``forward``.
        d: Number of output channels of each horizontal convolution.
        d_prime: Number of output channels of the vertical convolution.
        drop_ratio: Dropout probability applied before the dense layer.
    """

    def __init__(self, num_factors, num_users, num_items, L=5, d=16,
                 d_prime=4, drop_ratio=0.05):
        super().__init__()
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)
        self.d_prime, self.d = d_prime, d
        # Vertical convolution layer: spans the whole history at once.
        self.conv_v = nn.Conv2d(1, d_prime, (L, 1))
        # Horizontal convolution layers: one per window height 1..L, each
        # spanning the full embedding width.
        heights = list(range(1, L + 1))
        self.conv_h = nn.ModuleList(
            [nn.Conv2d(1, d, (height, num_factors)) for height in heights])
        # After a height-`h` convolution L - h + 1 positions remain; pooling
        # over all of them leaves one value per channel.
        self.max_pool = nn.ModuleList(
            [nn.MaxPool1d(L - height + 1) for height in heights])
        # Fully connected layer over the concatenated conv features.
        self.fc1_dim_v = d_prime * num_factors
        self.fc1_dim_h = d * len(heights)
        self.fc = nn.Sequential(
            nn.Linear(self.fc1_dim_v + self.fc1_dim_h, num_factors),
            nn.ReLU())
        # Output item embedding is twice as wide because it is matched
        # against [dense features ; user embedding].
        self.Q_prime = nn.Embedding(num_items, num_factors * 2)
        self.b = nn.Embedding(num_items, 1)
        self.dropout = nn.Dropout(drop_ratio)

    def forward(self, user_id, seq, item_id):
        """Score one target item per example.

        Args:
            user_id: Long tensor of user indices, one per example.
            seq: Long tensor of item histories with ``L`` items per example.
            item_id: Long tensor with one target item per example; both a
                flat ``(B,)`` and a column ``(B, 1)`` layout are accepted.

        Returns:
            Tensor with one score per example.
        """
        # Add a channel axis so the history matrix can go through Conv2d.
        item_embs = self.Q(seq).unsqueeze(1)
        user_emb = self.P(user_id)

        # Collect only the branches that are enabled, so a model built with
        # d == 0 or d_prime == 0 does not try to concatenate a missing branch.
        features = []
        if self.d_prime:
            out_v = self.conv_v(item_embs)
            features.append(out_v.reshape(out_v.shape[0], self.fc1_dim_v))
        if self.d:
            for conv, maxp in zip(self.conv_h, self.max_pool):
                conv_out = torch.relu(conv(item_embs)).squeeze(3)
                features.append(maxp(conv_out).squeeze(2))
        out = torch.cat(features, dim=1)

        z = self.fc(self.dropout(out))
        x = torch.cat([z, user_emb], dim=1)

        # Flatten the targets explicitly instead of relying on a bare
        # squeeze(): positive targets arrive as (B, 1) and negatives as (B,),
        # and both must produce per-example vectors for any batch size.
        item_id = item_id.reshape(-1)
        q_prime_i = self.Q_prime(item_id)
        b = self.b(item_id).squeeze(-1)
        return (x * q_prime_i).sum(1) + b


# Each example: (user, last-L items, target item, negative target). Per-user
# sliding windows over their interaction sequence:
class SeqDataset(torch.utils.data.Dataset):
    """Sliding-window training examples for a sequence-aware recommender.

    Interactions are grouped per user (keeping their input order within each
    user) and cut into windows of ``L + 1`` consecutive items: the first ``L``
    items are the history and the last one is the positive target. Indexing
    the dataset additionally draws one random negative item for the user.

    NOTE(review): the input is presumably in chronological order per user;
    nothing here sorts by time. The fixed-width rows are also not padded, so
    this assumes every user has at least ``L`` interactions.

    Args:
        user_ids: User index of every interaction.
        item_ids: Item index of every interaction, aligned with ``user_ids``.
        L: Number of history items per example.
        num_users: Number of users; sizes ``test_seq``.
        num_items: Number of items; negatives come from ``range(num_items)``.
        candidates: Mapping user -> items the user interacted with in train.
        test_items: Optional mapping user -> held-out items, also excluded
            from the negative pool.

    Attributes:
        ns: Number of examples.
        seq_users: ``(ns,)`` user of each example.
        seq_items: ``(ns, L)`` history of each example.
        seq_tgt: ``(ns, 1)`` positive target of each example.
        test_seq: ``(num_users, L)`` last ``L`` items of each user's most
            recent window.
        neg_pool: Mapping user -> list of items usable as negatives.
    """

    def __init__(self, user_ids, item_ids, L, num_users, num_items,
                 candidates, test_items=None):
        user_ids = torch.tensor(user_ids, dtype=torch.long)
        item_ids = torch.tensor(item_ids, dtype=torch.long)

        # Stable sort by user so each user's interactions become contiguous
        # while keeping their relative order. Sorting plain Python ints
        # avoids one tensor `.item()` call per comparison key.
        uid_list = user_ids.tolist()
        order = torch.tensor(
            sorted(range(len(uid_list)), key=uid_list.__getitem__),
            dtype=torch.long)
        u_ids, i_ids = user_ids[order], item_ids[order]

        # Precompute each user's negative pool once: items the user has not
        # interacted with in train AND not held out as a test positive
        # (excluding test positives prevents leakage into the BPR loss).
        all_items = set(range(num_items))
        test_items = test_items or {}
        self.neg_pool = {
            u: list(all_items - set(candidates.get(u, []))
                    - set(test_items.get(u, [])))
            for u in candidates}

        # First position and interaction count of every user in the sorted
        # arrays.
        first_pos, counts = {}, {}
        for pos, uid in enumerate(u_ids.tolist()):
            first_pos.setdefault(uid, pos)
            counts[uid] = counts.get(uid, 0) + 1
        users = sorted(first_pos)
        u_ids = torch.tensor(users, dtype=torch.long)
        idx = torch.tensor([first_pos[u] for u in users], dtype=torch.long)

        # A user with c >= L + 1 interactions yields c - L windows; a shorter
        # history yields exactly one (the whole sequence).
        self.ns = ns = sum(
            counts[u] - L if counts[u] >= L + 1 else 1 for u in users)
        self.seq_items = torch.zeros(ns, L, dtype=torch.long)
        self.seq_users = torch.zeros(ns, dtype=torch.long)
        self.seq_tgt = torch.zeros(ns, 1, dtype=torch.long)
        self.test_seq = torch.zeros(num_users, L, dtype=torch.long)

        prev_uid = None
        for row, (uid, window) in enumerate(
                self._seq(u_ids, i_ids, idx, L + 1)):
            if uid != prev_uid:
                # Windows are produced newest-first, so the first window of
                # a user ends with that user's latest items.
                self.test_seq[uid] = window[-L:]
                prev_uid = uid
            self.seq_tgt[row] = window[-1:]
            self.seq_items[row] = window[:L]
            self.seq_users[row] = uid

    def _win(self, tensor, window_size, step_size=1):
        """Yield windows of ``window_size`` items, starting from the end.

        A sequence shorter than ``window_size`` is yielded once, unchanged.
        """
        if len(tensor) < window_size:
            yield tensor
            return
        for end in range(len(tensor), 0, -step_size):
            if end < window_size:
                break
            yield tensor[end - window_size:end]

    def _seq(self, u_ids, i_ids, idx, max_len):
        """Yield ``(user, window)`` pairs for every user in turn.

        ``idx[k]`` is the start of user ``u_ids[k]``'s slice of ``i_ids``; the
        slice ends where the next user's begins (or at the end of the data).
        """
        num_groups = len(idx)
        for k in range(num_groups):
            start = int(idx[k])
            stop = int(idx[k + 1]) if k + 1 < num_groups else None
            for window in self._win(i_ids[start:stop], max_len):
                yield int(u_ids[k]), window

    def __len__(self):
        """Return the number of examples."""
        return self.ns

    def __getitem__(self, idx):
        """Return ``(user, history, positive target, random negative item)``."""
        neg = self.neg_pool[int(self.seq_users[idx])]
        return (self.seq_users[idx], self.seq_items[idx], self.seq_tgt[idx],
                random.choice(neg))


# The sequence-aware split holds out each user's most recent interaction. A
# training row is (user, history, positive, negative), so the model sees both
# long-term identity and short-term context:
# Data preparation: load MovieLens-100K through the d2l helpers, split it in
# 'seq-aware' mode, and wrap the training interactions in a SeqDataset with
# history length L.
TARGET_NUM, L, batch_size = 1, 5, 4096
df, num_users, num_items = d2l.read_data_ml100k()
train_data, test_data = d2l.split_data_ml100k(df, num_users, num_items,
                                              'seq-aware')
users_train, items_train, ratings_train, candidates = d2l.load_data_ml100k(
    train_data, num_users, num_items, feedback="implicit")
# NOTE(review): `test_iter` is passed to SeqDataset as `test_items`, which
# calls `.get(user, [])` on it — presumably a user -> held-out items mapping;
# confirm against d2l.load_data_ml100k.
users_test, items_test, ratings_test, test_iter = d2l.load_data_ml100k(
    test_data, num_users, num_items, feedback="implicit")
train_seq_data = SeqDataset(users_train, items_train, L, num_users,
                            num_items, candidates, test_items=test_iter)
train_iter = torch.utils.data.DataLoader(
    train_seq_data, batch_size, shuffle=True, drop_last=True,
    num_workers=d2l.get_dataloader_workers())
# Per-user history rows used when ranking at evaluation time.
test_seq_iter = train_seq_data.test_seq
# Inspect the first training example (user, history, positive, negative).
train_seq_data[0]
# Recorded output:
# (tensor(0), tensor([241, 170, 110, 255, 4]), tensor([101]), 1597)
# Use the same optimizer and BPR loss as NeuMF for a fair comparison. The
# expensive part is ranking evaluation, which scores many candidate items per
# user:
# Devices reported by the d2l helper; the first one hosts the model.
devices = d2l.try_all_gpus()
# Caser with 10 latent factors and a history window of L items; the remaining
# hyperparameters keep their defaults.
net = Caser(num_factors=10, num_users=num_users, num_items=num_items, L=L)
def _init(m):
# Match MX's `init.Normal(0.01)` semantics: initialize weights *and*
# biases, so biases don't keep their default uniform fan-in init.
if hasattr(m, 'weight') and m.weight is not None:
nn.init.normal_(m.weight, 0, 0.01)
if hasattr(m, 'bias') and m.bias is not None:
nn.init.zeros_(m.bias)
# Re-initialize every sub-module, then move the model to the first device.
net.apply(_init)
net = net.to(devices[0])
lr, num_epochs, wd, optimizer = 0.04, 8, 1e-5, 'adam'
loss = d2l.BPRLoss()
trainer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
# `evaluate_ranking` is the bottleneck — it scores every (user, item)
# pair that the user has not interacted with, so per-epoch evaluation
# dominates total runtime. Setting `eval_step=num_epochs` defers
# evaluation to the final epoch only, which keeps the cell well under
# an hour while still reporting hit-rate / AUC for the trained model.
d2l.train_ranking(net, train_iter, test_iter, loss, trainer,
                  test_seq_iter, num_users, num_items, num_epochs,
                  devices, d2l.evaluate_ranking, candidates,
                  eval_step=num_epochs)
# Recorded output:
# train loss 0.135, test hit rate 0.347, test AUC 0.883
# 945.3 examples/sec on [device(type='cuda', index=0)]