from collections import defaultdict
from d2l import mxnet as d2l
from mxnet import gluon, np
import os

Pure (user, item) collaborative filtering breaks for cold start — new users and new items have no history. Real recommenders integrate side features: item attributes, user profiles, time, device, query context, …
This deck sets up the online advertising CTR (click-through rate) prediction problem: predict the click probability from a sparse vector of categorical features. This is feature-rich recommendation in its purest form. The next two decks (FM and DeepFM) train models on this loader.
Tab-separated; each row has many one-hot categorical fields plus a binary click label. Sparsity is extreme — think “1 of 10000 in each field”:
Build per-field vocabularies, encode each row as a sparse feature index vector, yield (features, label) pairs:
class CTRDataset(gluon.data.Dataset):
    """Tab-separated CTR dataset whose categorical fields become index vectors.

    Every usable line of the input file has ``num_feat + 1`` tab-separated
    columns: column 0 is parsed as a float label and columns
    ``1..num_feat`` are kept as raw categorical strings. Each field gets its
    own vocabulary (value -> local index); ``__getitem__`` shifts the local
    indices by per-field offsets so that all fields share one index space.
    """

    def __init__(self, data_path, feat_mapper=None, defaults=None,
                 min_threshold=4, num_feat=34):
        """Read ``data_path`` and build (or reuse) the per-field vocabularies.

        Args:
            data_path: Path of the tab-separated text file to read.
            feat_mapper: Optional ``{field: {value: index}}`` mapping with
                fields numbered from 1. When omitted, it is built from the
                values seen in this file.
            defaults: Optional ``{field: index}`` giving the index used for
                values missing from ``feat_mapper``. When omitted, it is
                set to ``len(feat_mapper[field])`` for every field.
            min_threshold: Minimum number of occurrences a value needs in
                order to get its own index when the vocabulary is built here.
            num_feat: Number of categorical columns that follow the label.

        Raises:
            ValueError: If ``defaults`` is given without ``feat_mapper``;
                the fallback indices would not belong to the vocabulary
                built from this file.
        """
        if feat_mapper is None and defaults is not None:
            raise ValueError(
                'defaults was given without feat_mapper; pass both or neither')

        self.NUM_FEATS, self.count, self.data = num_feat, 0, {}
        self.feat_mapper, self.defaults = feat_mapper, defaults
        self.field_dims = np.zeros(self.NUM_FEATS, dtype=np.int64)

        # Occurrence counts are only needed to build a vocabulary, so they
        # are skipped entirely when the caller supplies one.
        build_vocab = feat_mapper is None
        feat_cnts = defaultdict(lambda: defaultdict(int))

        with open(data_path) as f:
            for line in f:
                values = line.rstrip('\n').split('\t')
                # Lines with an unexpected number of columns are ignored.
                if len(values) != self.NUM_FEATS + 1:
                    continue
                if build_vocab:
                    for i in range(1, self.NUM_FEATS + 1):
                        feat_cnts[i][values[i]] += 1
                # Rows are stored under consecutive integer keys starting at 0.
                self.data[self.count] = {
                    'y': [float(values[0])],
                    'x': values[1:],
                }
                self.count += 1

        if build_vocab:
            # Keep values seen at least min_threshold times; sorting makes
            # the value -> index assignment deterministic.
            kept = {
                i: sorted(v for v, c in cnt.items() if c >= min_threshold)
                for i, cnt in feat_cnts.items()
            }
            self.feat_mapper = {
                i: {feat_v: idx for idx, feat_v in enumerate(vals)}
                for i, vals in kept.items()
            }
            # The fallback index sits one past the last known value.
            self.defaults = {i: len(vals) for i, vals in kept.items()}
        elif defaults is None:
            # A mapper without defaults used to fail on the first
            # __getitem__; derive the same one-past-the-end fallback.
            self.defaults = {
                i: len(fm) for i, fm in self.feat_mapper.items()
            }

        # Each field reserves one extra slot for the fallback index.
        for i, fm in self.feat_mapper.items():
            self.field_dims[i - 1] = len(fm) + 1
        # Exclusive prefix sum of field_dims: the first global index of
        # every field.
        self.offsets = np.array(
            (0, *np.cumsum(self.field_dims).asnumpy()[:-1]))

    def __len__(self):
        """Return the number of rows that were kept while reading the file."""
        return self.count

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        The features are the per-field indices shifted by ``self.offsets``;
        values absent from a field's vocabulary use that field's default.
        """
        feat = np.array([
            self.feat_mapper[i + 1].get(v, self.defaults[i + 1])
            for i, v in enumerate(self.data[idx]['x'])
        ])
        # Wrap label in np.array so DataLoader batching yields an ndarray
        # (not a list-of-lists), matching the pytorch tab's torch.tensor(...).
        return feat + self.offsets, np.array(self.data[idx]['y'])