Image Classification (CIFAR-10) on Kaggle

Kaggle CIFAR-10

A capstone deck: assemble everything from the chapter (augmentation, fine-tuning, modern CNN architectures) and take on a Kaggle competition. CIFAR-10 has been done to death, but it’s the right size for a teaching example — small enough to fit in memory, big enough that augmentation and ensembling matter.

Kaggle CIFAR-10 competition page.

import collections
from d2l import jax as d2l
import jax
from jax import numpy as jnp
from flax import nnx
import optax
import numpy as np
import tensorflow as tf
import math
import os
import pandas as pd
import shutil

Downloading

Tiny demo subset for the book; swap in the full dataset for the actual competition:

# Register the tiny demo archive with d2l's download hub as a
# (URL, hex digest) pair -- presumably the digest is used by d2l to verify
# the download.
d2l.DATA_HUB['cifar10_tiny'] = (
    d2l.DATA_URL + 'kaggle_cifar10_tiny.zip',
    '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd',
)

# Set `demo` to False once the full dataset downloaded for the Kaggle
# competition is available locally
demo = True

# Demo mode fetches and unpacks the tiny archive; otherwise the full
# dataset is expected at a fixed relative path.
data_dir = (d2l.download_extract('cifar10_tiny') if demo
            else '../data/cifar-10/')

Organizing the dataset

Kaggle ships everything in one folder; most vision toolkits expect the folder-per-class layout (train/<class>/img.png). Build that layout from trainLabels.csv:

def read_csv_labels(fname):
    """Read `fname` to return a filename to label dictionary.

    The first line is treated as the column header and skipped. Every
    following non-blank line must have the form ``name,label``.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        A dict mapping each name (first column) to its label (second
        column). An empty or header-only file yields an empty dict.

    Raises:
        ValueError: If a non-blank data line does not consist of exactly
            two comma-separated fields.
    """
    labels = {}
    with open(fname, 'r') as f:
        # Skip the file header line (column name); the default keeps an
        # empty file from raising StopIteration.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines such as a trailing newline at EOF,
                # which would otherwise break the two-field unpacking.
                continue
            name, label = line.split(',')
            labels[name] = label
    return labels

# Parse the training labels once and report how many examples and how many
# distinct classes they cover.
labels = read_csv_labels(os.path.join(data_dir, 'trainLabels.csv'))
print('# training examples:', len(labels))
print('# classes:', len({*labels.values()}))

# training examples: 1000
# classes: 10

def copyfile(filename, target_dir):
    """Place a copy of `filename` inside `target_dir`.

    The target directory (including any missing parents) is created first
    when it does not exist yet.
    """
    os.makedirs(name=target_dir, exist_ok=True)
    shutil.copy(src=filename, dst=target_dir)


def reorg_train_valid(data_dir, labels, valid_ratio):
    """Carve a per-class validation subset out of the training images.

    Every file found in `<data_dir>/train` is copied into
    `train_valid_test/train_valid/<label>`. It is additionally copied into
    `train_valid_test/valid/<label>` while that class still has fewer than
    `n_valid_per_label` validation files, and into
    `train_valid_test/train/<label>` otherwise.

    Returns the number of validation examples taken per class.
    """
    # `most_common()` is ordered by descending count, so its last entry
    # belongs to the class with the fewest training examples
    _, rarest_count = collections.Counter(labels.values()).most_common()[-1]
    # Validation quota per class; never less than one example
    n_valid_per_label = max(1, math.floor(rarest_count * valid_ratio))
    source_dir = os.path.join(data_dir, 'train')
    out_root = os.path.join(data_dir, 'train_valid_test')
    valid_taken = {}
    for train_file in os.listdir(source_dir):
        # The part of the file name before the first dot is the label key
        label = labels[train_file.split('.')[0]]
        fname = os.path.join(source_dir, train_file)
        copyfile(fname, os.path.join(out_root, 'train_valid', label))
        taken = valid_taken.get(label, 0)
        if taken >= n_valid_per_label:
            # Quota for this class is full: the file goes to training
            copyfile(fname, os.path.join(out_root, 'train', label))
            continue
        copyfile(fname, os.path.join(out_root, 'valid', label))
        valid_taken[label] = taken + 1
    return n_valid_per_label

def reorg_test(data_dir):
    """Copy every file of `<data_dir>/test` into the single placeholder
    class folder `train_valid_test/test/unknown` for use at prediction
    time."""
    source_dir = os.path.join(data_dir, 'test')
    unknown_dir = os.path.join(data_dir, 'train_valid_test', 'test',
                               'unknown')
    for test_file in os.listdir(source_dir):
        copyfile(os.path.join(source_dir, test_file), unknown_dir)

Run the reorg

def reorg_cifar10_data(data_dir, valid_ratio):
    """Reorganize the raw download into the folder-per-class layout.

    Reads `trainLabels.csv` from `data_dir`, splits the training images
    into train / valid / train_valid folders, then files the test images.
    """
    labels_csv = os.path.join(data_dir, 'trainLabels.csv')
    reorg_train_valid(data_dir, read_csv_labels(labels_csv), valid_ratio)
    reorg_test(data_dir)

# The tiny demo subset uses a smaller minibatch than the full dataset
if demo:
    batch_size = 32
else:
    batch_size = 128
# Ratio handed to the reorganization step to size the validation split
valid_ratio = 0.1
reorg_cifar10_data(data_dir, valid_ratio)

Augmentation pipelines

Standard recipe — upscale to 40×40, take a random 32×32 crop, flip, normalize for train; just normalize for eval:

# Per-channel mean and standard deviation (three values each) used to
# standardize images after their pixel values are divided by 255
CIFAR_MEAN = np.asarray((0.4914, 0.4822, 0.4465), dtype=np.float32)
CIFAR_STD = np.asarray((0.2023, 0.1994, 0.2010), dtype=np.float32)

def transform_train_fn(image, label):
    """Training augmentation: upscale, random crop, random flip, normalize.

    The image is cast to float32, resized to 40x40, cropped to a random
    32x32x3 window and mirrored left-right at random. Pixel values are then
    divided by 255 and standardized with `CIFAR_MEAN` / `CIFAR_STD`. The
    label is returned unchanged.
    """
    pixels = tf.image.resize(tf.cast(image, tf.float32), [40, 40])
    pixels = tf.image.random_crop(pixels, size=[32, 32, 3])
    pixels = tf.image.random_flip_left_right(pixels)
    pixels = (pixels / 255.0 - CIFAR_MEAN) / CIFAR_STD
    return pixels, label

def transform_test_fn(image, label):
    """Evaluation preprocessing without augmentation.

    Casts the image to float32, divides by 255 and standardizes it with
    `CIFAR_MEAN` / `CIFAR_STD`. The label is returned unchanged.
    """
    scaled = tf.cast(image, tf.float32) / 255.0
    return (scaled - CIFAR_MEAN) / CIFAR_STD, label

# Test transform is defined above as transform_test_fn

Data loaders

Folder-based dataset + the augmentation pipelines:

def _load_image_folder_tf(folder_path):
    """Build an unbatched, unshuffled tf.data.Dataset of (image, label)
    pairs from a directory containing one subfolder per class.

    Images are resized to 32x32 and labels are integers
    (`label_mode='int'`). NOTE(review): the Keras documentation describes
    the images produced by `image_dataset_from_directory` as float32
    tensors rather than uint8 -- confirm against the installed version.
    """
    return tf.keras.utils.image_dataset_from_directory(
        folder_path,
        image_size=(32, 32),
        label_mode='int',
        shuffle=False,
        batch_size=None,
    )

def _shuffle_buffer_size(ds, fallback=10000):
    """Return a shuffle buffer size that spans every element of `ds`.

    The folder loader is called with shuffle=False; per the Keras docs the
    files then come back in sorted order, i.e. grouped by class folder. A
    shuffle buffer smaller than the dataset only mixes examples that are
    close together in that order, so on the full dataset a fixed buffer of
    10000 would produce minibatches dominated by one or two classes.

    Args:
        ds: An unbatched tf.data.Dataset.
        fallback: Buffer size to use when the cardinality of `ds` is
            unknown or infinite (both are reported as negative values).

    Returns:
        The number of elements in `ds`, or `fallback`.
    """
    n = int(ds.cardinality().numpy())
    return n if n > 0 else fallback


# Load the four splits produced by the reorganization step
train_ds = _load_image_folder_tf(
    os.path.join(data_dir, 'train_valid_test', 'train'))
train_valid_ds = _load_image_folder_tf(
    os.path.join(data_dir, 'train_valid_test', 'train_valid'))
valid_ds = _load_image_folder_tf(
    os.path.join(data_dir, 'train_valid_test', 'valid'))
test_ds = _load_image_folder_tf(
    os.path.join(data_dir, 'train_valid_test', 'test'))

# Training pipelines: augment, shuffle across the whole split, then form
# fixed-size batches (the final partial batch is dropped).
# NOTE: the shuffle buffer holds one decoded 32x32 image per example.
train_iter = (train_ds.map(transform_train_fn, num_parallel_calls=tf.data.AUTOTUNE)
              .shuffle(_shuffle_buffer_size(train_ds))
              .batch(batch_size, drop_remainder=True)
              .prefetch(tf.data.AUTOTUNE))
train_valid_iter = (train_valid_ds.map(transform_train_fn,
                    num_parallel_calls=tf.data.AUTOTUNE)
                    .shuffle(_shuffle_buffer_size(train_valid_ds))
                    .batch(batch_size, drop_remainder=True)
                    .prefetch(tf.data.AUTOTUNE))
# Evaluation pipelines: deterministic order, normalization only.
# NOTE(review): drop_remainder=True means validation examples in the last
# partial batch are never scored -- confirm this is intended.
valid_iter = (valid_ds.map(transform_test_fn, num_parallel_calls=tf.data.AUTOTUNE)
              .batch(batch_size, drop_remainder=True)
              .prefetch(tf.data.AUTOTUNE))
# The test pipeline keeps the final partial batch so that every test image
# receives a prediction.
test_iter = (test_ds.map(transform_test_fn, num_parallel_calls=tf.data.AUTOTUNE)
             .batch(batch_size, drop_remainder=False)
             .prefetch(tf.data.AUTOTUNE))

ResNet-18 residual block

No transfer learning this time — CIFAR-10 is small enough to train from scratch. The core unit is the same residual block from the ResNet chapter: two 3×3 convs plus an identity or 1×1 projection shortcut.

Assembling ResNet-18

Four residual stages progressively downsample the image and widen channels. Global average pooling removes spatial dimensions; the final dense layer emits 10 class logits:

Framework model contract

Across frameworks, get_net returns the same contract: input minibatches of CIFAR-10 images, output logits with shape (batch, 10), and cross-entropy as the training loss.

class Residual(nnx.Module):
    """Residual block: two 3x3 convolutions, each followed by batch
    normalization, added to a shortcut that is either the unchanged input
    or a 1x1 convolution of it."""
    def __init__(self, in_channels, num_channels, use_1x1conv=False,
                 strides=1, *, rngs):
        """Create the layers of the block.

        Args:
            in_channels: Number of channels of the block input.
            num_channels: Number of output channels of both 3x3
                convolutions (and of the 1x1 shortcut, if present).
            use_1x1conv: If True, the shortcut passes through a 1x1
                convolution; otherwise the input is added unchanged.
            strides: Stride of the first 3x3 convolution and of the 1x1
                shortcut convolution, applied to both spatial dimensions.
            rngs: RNG container forwarded to every layer constructor.
        """
        stride = (strides, strides)
        self.conv1 = nnx.Conv(in_channels, num_channels, (3, 3),
                              strides=stride, padding='same', rngs=rngs)
        self.bn1 = nnx.BatchNorm(num_channels, rngs=rngs)
        self.conv2 = nnx.Conv(num_channels, num_channels, (3, 3),
                              padding='same', rngs=rngs)
        self.bn2 = nnx.BatchNorm(num_channels, rngs=rngs)
        # Optional projection shortcut; None means identity shortcut
        self.conv3 = (nnx.Conv(in_channels, num_channels, (1, 1),
                               strides=stride, rngs=rngs)
                      if use_1x1conv else None)

    def __call__(self, X):
        """Apply the block to `X` and return the activated sum of the main
        path and the shortcut."""
        # Main path: conv -> BN -> ReLU -> conv -> BN
        Y = nnx.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Project the input when a 1x1 shortcut was requested.
        # NOTE(review): without it, Y + X presumably requires
        # in_channels == num_channels and strides == 1 -- confirm.
        if self.conv3 is not None:
            X = self.conv3(X)
        return nnx.relu(Y + X)

class ResNet18(nnx.Module):
    """Network built from a 3x3 conv stem, four stages of two `Residual`
    blocks each (64, 128, 256, 512 channels), spatial averaging and a
    linear classification head."""
    def __init__(self, num_classes=10, *, rngs):
        """Create the stem, the residual blocks and the head.

        Args:
            num_classes: Number of outputs of the final linear layer.
            rngs: RNG container forwarded to every layer constructor.
        """
        # Stem: 3 -> 64 channel 3x3 convolution, batch norm, ReLU
        self.stem = nnx.List([
            nnx.Conv(3, 64, (3, 3), padding='same', rngs=rngs),
            nnx.BatchNorm(64, rngs=rngs), nnx.relu])
        channels = [64, 128, 256, 512]
        blocks, in_channels = [], 64
        for i, num_channels in enumerate(channels):
            for j in range(2):
                # The first block of every stage except the first uses
                # stride 2 together with a 1x1 shortcut convolution
                strides = 2 if i > 0 and j == 0 else 1
                use_1x1conv = i > 0 and j == 0
                blocks.append(Residual(in_channels, num_channels,
                                       use_1x1conv, strides, rngs=rngs))
                # Later blocks of the stage take the widened channel count
                in_channels = num_channels
        self.blocks = nnx.List(blocks)
        self.head = nnx.Linear(512, num_classes, rngs=rngs)

    def __call__(self, X):
        """Run the stem, all residual blocks, averaging over axes 1 and 2,
        and the linear head; returns the head's output.

        Assumes `X` is laid out as (batch, height, width, channels) --
        TODO confirm against the data pipeline.
        """
        for layer in self.stem:
            X = layer(X)
        for block in self.blocks:
            X = block(X)
        X = jnp.mean(X, axis=(1, 2))  # Global average pooling
        return self.head(X)

def get_net():
    """Construct a fresh 10-class `ResNet18` whose layers are created from
    `nnx.Rngs(0)`."""
    rngs = nnx.Rngs(0)
    return ResNet18(num_classes=10, rngs=rngs)

def loss_fn(logits, labels):
    """Softmax cross-entropy between `logits` and integer class `labels`,
    returned exactly as optax computes it (no reduction is applied here)."""
    return optax.softmax_cross_entropy_with_integer_labels(
        logits=logits, labels=labels)

Training function

SGD with momentum + weight decay + LR step decay is the classic small-image vision recipe. The long helper mainly adapts that recipe to each framework, so focus on the invariant loop:

augment and load a minibatch;
compute logits and cross-entropy;
backpropagate with momentum and weight decay;
step the learning-rate schedule;
log validation accuracy for model selection.

Train

Use the validation split for model selection. Training loss should decline smoothly; validation accuracy is the signal for whether augmentation and the learning-rate schedule are helping rather than just fitting the train set.

# Hyperparameters handed to `train` (presumably epochs, learning rate and
# weight decay -- see the training helper for their exact use)
num_epochs = 20
lr = 0.005
wd = 5e-4
# Learning-rate schedule settings (presumably a period in epochs and a
# multiplicative decay factor)
lr_period = 4
lr_decay = 0.9
# Train a freshly initialized network, using the validation split
net = get_net()
net = train(net, train_iter, valid_iter, num_epochs, lr, wd,
            lr_period, lr_decay)

train loss 0.715, train acc 0.749, valid acc 0.312
1177.5 examples/sec

Submit predictions

Run on the test set, write a Kaggle-format CSV:

# Retrain a fresh network on the combined train + validation data; no
# validation iterator is passed this time.
net, preds = get_net(), []
net = train(net, train_valid_iter, None, num_epochs, lr, wd, lr_period,
            lr_decay)

# Inference view of the trained model with use_running_average=True. Built
# once because its arguments do not change between batches.
eval_net = nnx.view(net, use_running_average=True)
for X, _ in test_iter:
    X_jax = jnp.array(X.numpy())  # Already NHWC from tf.data
    y_hat = eval_net(X_jax)
    # Predicted class index = position of the largest logit
    preds.extend(np.array(y_hat.argmax(axis=-1)))
# Get class names from the train_valid dataset directory
class_names = sorted(os.listdir(
    os.path.join(data_dir, 'train_valid_test', 'train_valid')))
# `test_iter` keeps its final partial batch, so there is exactly one
# prediction per test image: len(preds) gives the image count without
# decoding the whole test set a second time. The ids are ordered as
# strings ('1', '10', '100', ...), presumably to match the file-name order
# in which the loader lists the test images.
sorted_ids = sorted(range(1, len(preds) + 1), key=str)
df = pd.DataFrame({'id': sorted_ids, 'label': preds})
# Map class indices back to class names
df['label'] = df['label'].apply(lambda x: class_names[x])
df.to_csv('submission.csv', index=False)

train loss 0.631, train acc 0.777
1380.0 examples/sec

Recap

Real competition setup: download → reorganize files → augment → train → predict → submit.
Augmentation matters more than model tweaks at the CIFAR-10 scale.
ResNet-18 from scratch + standard recipe is a strong baseline; the chapter techniques (mixup, cutmix, cosine schedule, longer training) push it higher.
This pipeline scales to ImageNet — only the model size and training time change.