The MovieLens Dataset

MovieLens Dataset

MovieLens — the canonical recommender systems benchmark. 100k version: 943 users, 1682 movies, 100k explicit ratings on a 1–5 scale. Sparse: only ~6% of the user×item matrix is filled.

This deck loads MovieLens-100k and sets up:

  • Random split — for rating prediction (matrix factorization style).
  • Sequence split — for sequential recommendation (later in the chapter).

Output: minibatches of (user, item, rating) triples. The following decks build matrix factorization, AutoRec, and neural collaborative filtering on top.

Downloading

from d2l import torch as d2l
import numpy as np
import os
import pandas as pd
import random
import torch
# Register the MovieLens-100k archive under the key 'ml-100k'; the same key is
# passed to d2l.download_extract in read_data_ml100k. The tuple holds the
# download URL followed by a 40-character hex digest.
# NOTE(review): the digest is presumably the SHA-1 checksum d2l uses to
# validate the downloaded file — confirm against d2l's download helper.
d2l.DATA_HUB['ml-100k'] = (
    'https://files.grouplens.org/datasets/movielens/ml-100k.zip',
    'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')


def read_data_ml100k():
    """Download MovieLens-100k and load its rating table.

    Returns:
        A tuple ``(data, num_users, num_items)`` where ``data`` is a
        DataFrame with columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp`` read from the tab-separated ``u.data`` file, and the
        two counts are the numbers of distinct user and item ids in it.
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_path = os.path.join(d2l.download_extract('ml-100k'), 'u.data')
    # u.data has no header row, so the column names are supplied explicitly.
    data = pd.read_csv(ratings_path, sep='\t', names=columns,
                       engine='python')
    distinct_users = data['user_id'].unique()
    distinct_items = data['item_id'].unique()
    return data, len(distinct_users), len(distinct_items)

Dataset statistics

Two diagnostics are especially informative for a recommender dataset: the sparsity of the user×item matrix and the distribution of rating values. We compute the first and plot a histogram of the second:

# Load the ratings table together with the distinct user and item counts.
data, num_users, num_items = read_data_ml100k()
# Sparsity: one minus (number of rating rows / number of user-item cells).
# This equals the fraction of empty cells as long as each (user, item) pair
# appears in at most one row.
sparsity = 1 - len(data) / (num_users * num_items)
print(f'number of users: {num_users}, number of items: {num_items}')
print(f'matrix sparsity: {sparsity:f}')
# Show the first five rows as a quick sanity check of the column layout.
print(data.head(5))
number of users: 943, number of items: 1682
matrix sparsity: 0.936953
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
# Histogram of the rating column: five bins, bars outlined in black.
d2l.plt.hist(data['rating'], bins=5, ec='black')
d2l.plt.xlabel('Rating')
d2l.plt.ylabel('Count')
d2l.plt.title('Distribution of Ratings in MovieLens 100K')
d2l.plt.show()

Train / test split

Two splits to support different evaluation protocols:

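  • Random — each row is assigned to the test set independently with probability test_ratio (fixed seed, so the split is reproducible and the test fraction is only approximately test_ratio). For rating prediction.
  • Seq-aware — hold out each user's most recent interaction (largest timestamp) as the test set; the user's remaining interactions, in chronological order, form the training set. Closer to “predict what they rate next”.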
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode.

    Args:
        data: DataFrame whose first four columns are, in order, user id,
            item id, rating and timestamp.
        num_users: Number of users. Kept for interface compatibility; the
            seq-aware branch iterates over the user ids actually present in
            ``data`` rather than assuming they are exactly 1..num_users.
        num_items: Number of items. Unused here; kept for interface
            compatibility.
        split_mode: ``'seq-aware'`` holds out each user's latest interaction;
            any other value selects the random split.
        test_ratio: Probability that a row is assigned to the test set in
            random mode. Ignored in seq-aware mode.

    Returns:
        ``(train_data, test_data)`` as DataFrames. In seq-aware mode they are
        rebuilt from ``(user, item, rating, timestamp)`` tuples and therefore
        carry default integer column labels; in random mode they are row
        subsets of ``data`` and keep its columns and index.
    """
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            # Strict '<' keeps the first-seen row when timestamps tie.
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        # Walk the ids that really occur (ascending) instead of
        # range(1, num_users + 1): the latter raises KeyError on gaps in the
        # id space and silently drops ids above num_users. For contiguous
        # 1..num_users ids the order is identical.
        for u in sorted(train_items):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        # O(N) set-membership filter instead of O(N^2) list-membership.
        test_set = set(test_data)
        train_data = [item for item in train_list if item not in test_set]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        # Seed for deterministic splits across frameworks; uses Python's
        # `random` for cross-framework portability (some frameworks' numpy
        # shim lacks `np.random.default_rng`).
        rng = random.Random(0)
        # One independent draw per row: True -> train, False -> test.
        mask = [rng.random() < 1 - test_ratio for _ in range(len(data))]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

DataLoader

def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    """Convert a ratings table into index lists plus an interaction record.

    Args:
        data: DataFrame whose first three columns are user id, item id and
            rating; ids are shifted down by one to become indices.
        num_users: Number of columns of the explicit interaction matrix.
        num_items: Number of rows of the explicit interaction matrix.
        feedback: ``'explicit'`` stores the integer rating in a
            ``(num_items, num_users)`` array; ``'implicit'`` stores score 1
            and a dict mapping user index to the list of item indices.

    Returns:
        ``(users, items, scores, inter)`` — three parallel lists and the
        interaction record described above.
    """
    explicit = feedback == 'explicit'
    implicit = feedback == 'implicit'
    if explicit:
        inter = np.zeros((num_items, num_users))
    else:
        inter = {}
    users, items, scores = [], [], []
    for row in data.itertuples():
        # row[0] is the DataFrame index; ids are converted to 0-based.
        u_idx = int(row[1] - 1)
        i_idx = int(row[2] - 1)
        if explicit:
            score = int(row[3])
        else:
            score = 1
        users.append(u_idx)
        items.append(i_idx)
        scores.append(score)
        if implicit:
            inter.setdefault(u_idx, []).append(i_idx)
        else:
            inter[i_idx, u_idx] = score
    return users, items, scores, inter
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
                          test_ratio=0.1, batch_size=256):
    """Read MovieLens-100k, split it, and wrap both parts in DataLoaders.

    Args:
        split_mode: Forwarded to ``split_data_ml100k``.
        feedback: Forwarded to ``load_data_ml100k``.
        test_ratio: Forwarded to ``split_data_ml100k``.
        batch_size: Minibatch size used by both loaders.

    Returns:
        ``(num_users, num_items, train_iter, test_iter)``. Each loader yields
        ``(user, item, score)`` tensor batches; scores are cast to float.
        Only the training loader shuffles.
    """
    data, num_users, num_items = read_data_ml100k()
    train_frame, test_frame = split_data_ml100k(
        data, num_users, num_items, split_mode, test_ratio)

    def to_tensor_dataset(frame):
        # The interaction record returned as the fourth value is not needed.
        users, items, scores, _ = load_data_ml100k(
            frame, num_users, num_items, feedback)
        return torch.utils.data.TensorDataset(
            torch.tensor(users), torch.tensor(items),
            torch.tensor(scores).float())

    train_set = to_tensor_dataset(train_frame)
    test_set = to_tensor_dataset(test_frame)
    train_iter = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True, drop_last=False)
    test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
    return num_users, num_items, train_iter, test_iter

Recap

  • MovieLens-100k: small, well-understood ratings dataset — ideal for teaching, too small for SOTA claims.
  • Sparsity (~94% missing) is the central challenge for every method in this chapter.
  • Output format: (user, item, rating) minibatches + framework-native loaders.