%matplotlib inline
from d2l import tensorflow as d2l
import tensorflow as tf
import numpy as np
import os
import pandas as pd
from PIL import Image

The classic detection benchmarks (PASCAL VOC, COCO) are big — too big for a teaching demo. Instead, this section uses the banana detection dataset: 1000 images, one banana per image, a fixed image size, and three randomized parameters (position, scale, rotation).
The point isn’t to push detection accuracy; it’s to walk through the data plumbing every detector needs: turning images and their annotations into (images, labels) minibatches. Labels have shape (batch, max_objects, 5).

The tiny banana dataset is intentionally simple: one class, one object per image, and normalized box coordinates. That keeps the loader visible before SSD adds anchor matching.
Read all images, parse the CSV-style annotation file, return aligned arrays of images and label tensors:
def read_data_bananas(is_train=True):
    """Read the banana detection dataset images and labels.

    Args:
        is_train: If true, read the ``bananas_train`` split; otherwise read
            the ``bananas_val`` split.

    Returns:
        A pair ``(images, labels)``. ``images`` is a list holding one
        float32 tensor per row of the split's ``label.csv``. ``labels`` is a
        float32 tensor built from the CSV rows with an extra axis inserted
        at position 1 (the per-image object axis) and every value divided
        by 256.
    """
    from PIL import Image

    data_dir = d2l.download_extract('banana-detection')
    # Both the annotation file and the image folder live under the split
    # directory, so resolve it once.
    split_dir = os.path.join(
        data_dir, 'bananas_train' if is_train else 'bananas_val')
    csv_data = pd.read_csv(os.path.join(split_dir, 'label.csv'))
    csv_data = csv_data.set_index('img_name')
    images, targets = [], []
    for img_name, target in csv_data.iterrows():
        img_path = os.path.join(split_dir, 'images', str(img_name))
        # `np.array` copies the decoded pixels, so the image (and its file
        # handle) can be closed as soon as the tensor has been created.
        with Image.open(img_path) as img:
            images.append(tf.constant(np.array(img), dtype=tf.float32))
        # Here `target` contains (class, upper-left x, upper-left y,
        # lower-right x, lower-right y), where all the images have the same
        # banana class (index 0)
        targets.append(list(target))
    labels = tf.constant(targets, dtype=tf.float32)
    # NOTE(review): the division also scales the class column; that is
    # harmless only because every class index is 0. 256 is presumably the
    # image edge length in pixels -- confirm before reusing this loader.
    return images, tf.expand_dims(labels, axis=1) / 256


# Wrap the loader in a framework-native Dataset so we get a standard
# DataLoader:
class BananasDataset:
    """A customized dataset to load the banana detection dataset.

    The constructor reads one whole split through `read_data_bananas` and
    keeps the result in two attributes:

    * ``features``: the images returned by the reader.
    * ``labels``: the label tensor returned by the reader.

    Indexing returns the ``(feature, label)`` pair at that position, and
    ``len()`` reports the number of images.
    """

    def __init__(self, is_train):
        """Load the training split if `is_train`, else the validation split."""
        self.features, self.labels = read_data_bananas(is_train)
        split = 'training' if is_train else 'validation'
        print(f'read {len(self.features)} {split} examples')

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position `idx`."""
        return (self.features[idx], self.labels[idx])

    def __len__(self):
        """Return the number of images held by the dataset."""
        return len(self.features)


# The batch shape check should show why detection labels need an extra object
# dimension: images have the usual framework layout, while labels are
# (batch, max_objects, 5).
def load_data_bananas(batch_size):
    """Load the banana detection dataset.

    Args:
        batch_size: Number of examples per minibatch in both pipelines.

    Returns:
        A pair ``(train_iter, val_iter)`` of `tf.data.Dataset` objects that
        yield ``(images, labels)`` minibatches. The training pipeline is
        shuffled over the whole split and drops a final partial batch; the
        validation pipeline keeps file order and keeps the final partial
        batch.
    """
    train_dataset = BananasDataset(is_train=True)
    val_dataset = BananasDataset(is_train=False)
    # Stack images: result shape is (N, H, W, C) — NHWC for TF
    train_images = tf.stack(train_dataset.features)
    val_images = tf.stack(val_dataset.features)

    train_iter = tf.data.Dataset.from_tensor_slices(
        (train_images, train_dataset.labels))
    # `drop_remainder=True` keeps every training minibatch the same
    # shape, so the SSD `train_step` (`@tf.function`-wrapped at the call
    # site in :numref:`sec_ssd`) is traced for a single batch shape
    # instead of being retraced for a smaller last batch.
    train_iter = train_iter.shuffle(len(train_dataset.features)).batch(
        batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

    val_iter = tf.data.Dataset.from_tensor_slices(
        (val_images, val_dataset.labels))
    val_iter = val_iter.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return train_iter, val_iter


# Each label row is (class, x1, y1, x2, y2) with normalized corners. In this
# dataset max_objects = 1, but the same layout supports variable object counts
# by padding.

Images with fewer objects are padded up to max_objects with rows whose class is -1, which marks those rows to be ignored.