%matplotlib inline
from d2l import mxnet as d2l
from mxnet import gluon, image, np, npx
import os
npx.set_np()
Semantic segmentation assigns a class label to every pixel, not just to the image as a whole. The output has the same spatial shape (height × width) as the input, with one output channel per class.
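Two related tasks to keep distinct:
Semantic segmentation: pixel-level class labels such as dog, cat, background; all pixels of the same class share one label.
Instance segmentation: additionally separates individual objects of the same class (e.g. dog 1 vs. dog 2).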
This deck sets up the PASCAL VOC 2012 dataset and the data plumbing for FCN training (next deck).
The download gives paired directories: JPEG images and segmentation masks. The important invariant is one RGB mask per input image, with matching spatial dimensions.
Inputs are RGB images; labels are RGB images too — the class is encoded in the color, not in a 1-channel id tensor:
def read_voc_images(voc_dir, is_train=True):
    """Load every image/mask pair named in a VOC segmentation split.

    Args:
        voc_dir: Root of the extracted VOC directory. It must contain the
            ``ImageSets/Segmentation``, ``JPEGImages`` and
            ``SegmentationClass`` sub-directories.
        is_train: Read the names listed in ``train.txt`` when true,
            otherwise those listed in ``val.txt``.

    Returns:
        A ``(features, labels)`` pair of equally long lists holding the
        result of ``image.imread`` for each ``.jpg`` image and its ``.png``
        mask, in the order the names appear in the split file.
    """
    split_file = 'train.txt' if is_train else 'val.txt'
    split_path = os.path.join(voc_dir, 'ImageSets', 'Segmentation',
                              split_file)
    with open(split_path, 'r') as split:
        # The split file is a whitespace-separated list of file stems.
        stems = split.read().split()

    jpeg_dir = os.path.join(voc_dir, 'JPEGImages')
    mask_dir = os.path.join(voc_dir, 'SegmentationClass')
    features, labels = [], []
    for stem in stems:
        # Image and mask share the same stem; only directory and
        # extension differ.
        features.append(image.imread(os.path.join(jpeg_dir, f'{stem}.jpg')))
        labels.append(image.imread(os.path.join(mask_dir, f'{stem}.png')))
    return features, labels
train_features, train_labels = read_voc_images(voc_dir, True)
Build a lookup table from the 21 RGB triplets to class indices 0–20. After conversion, each label pixel is an integer target for cross-entropy:
# RGB colour used in the label masks for each class; the position of a
# triplet in this list is the class index written by voc_colormap2label().
VOC_COLORMAP = [
    [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
    [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
    [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
    [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
    [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
    [0, 64, 128],
]

# Human-readable class names.
# NOTE(review): expected to line up index-for-index with VOC_COLORMAP.
VOC_CLASSES = [
    'background', 'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor',
]


def voc_colormap2label():
    """Build the lookup table from a packed RGB value to a class index.

    Returns:
        A 1-D array of length ``256 ** 3``. The entry at
        ``(r * 256 + g) * 256 + b`` holds the position of ``[r, g, b]`` in
        ``VOC_COLORMAP``; every other entry keeps its initial value of 0.
    """
    table = np.zeros(256 ** 3)
    for class_index, (red, green, blue) in enumerate(VOC_COLORMAP):
        # Pack the three 8-bit channels into a single base-256 number.
        packed = (red * 256 + green) * 256 + blue
        table[packed] = class_index
    return table
def voc_label_indices(colormap, colormap2label):
    """Map any RGB values in VOC labels to their class indices.

    Args:
        colormap: Label image indexed as ``[row, column, channel]`` with the
            red, green and blue values in channels 0, 1 and 2.
        colormap2label: 1-D lookup table indexed by the packed value
            ``(r * 256 + g) * 256 + b``.

    Returns:
        The entries of ``colormap2label`` selected per pixel, i.e. an array
        with the two leading (spatial) dimensions of ``colormap``.
    """
    # Widen to 32-bit integers first so the multiplications below do not
    # depend on the (possibly 8-bit) dtype of the input image.
    rgb = colormap.astype(np.int32)
    red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
    packed = (red * 256 + green) * 256 + blue
    return colormap2label[packed]


# Standard image preprocessing resizes — but resizing the label would
# interpolate class IDs, which is meaningless. Use random crop on both image
# and label, with the same random window:
`VOCSegDataset` drops images smaller than the crop size and converts RGB labels to class-index tensors in `__getitem__`:
class VOCSegDataset(gluon.data.Dataset):
    """A customized dataset to load the VOC dataset.

    All images of the chosen split are read and normalized once in the
    constructor; images smaller than ``crop_size`` are discarded. Each item
    access crops the stored image/label pair and converts the RGB label
    into class indices.
    """

    def __init__(self, is_train, crop_size, voc_dir):
        """Read, filter and normalize one split of the dataset.

        Args:
            is_train: Forwarded to ``read_voc_images`` to select the split.
            crop_size: ``(height, width)`` of the crops returned by
                ``__getitem__``; also the minimum size an image must have.
            voc_dir: Dataset root, forwarded to ``read_voc_images``.
        """
        # Per-channel mean and standard deviation used by normalize_image.
        self.rgb_mean = np.array([0.485, 0.456, 0.406])
        self.rgb_std = np.array([0.229, 0.224, 0.225])
        # Must be set before filter() runs, which reads it.
        self.crop_size = crop_size
        raw_features, raw_labels = read_voc_images(voc_dir,
                                                   is_train=is_train)
        kept_features = self.filter(raw_features)
        self.features = [self.normalize_image(img) for img in kept_features]
        # Labels are filtered with the same size test as the features.
        self.labels = self.filter(raw_labels)
        self.colormap2label = voc_colormap2label()
        num_examples = len(self.features)
        print('read ' + str(num_examples) + ' examples')

    def normalize_image(self, img):
        """Scale ``img`` by 1/255, then standardize each channel."""
        scaled = img.astype('float32') / 255
        return (scaled - self.rgb_mean) / self.rgb_std

    def filter(self, imgs):
        """Return the images that are at least ``crop_size`` in both axes."""
        kept = []
        for img in imgs:
            too_short = img.shape[0] < self.crop_size[0]
            if too_short or img.shape[1] < self.crop_size[1]:
                continue
            kept.append(img)
        return kept

    def __getitem__(self, idx):
        """Return one cropped ``(feature, label)`` pair.

        The feature has its last axis moved to the front; the label is
        passed through ``voc_label_indices``.
        """
        # NOTE(review): voc_rand_crop is not defined in this section; it is
        # presumably the helper that crops image and label with the same
        # random window — confirm it is in scope.
        cropped_img, cropped_mask = voc_rand_crop(
            self.features[idx], self.labels[idx], *self.crop_size)
        channels_first = cropped_img.transpose(2, 0, 1)
        class_indices = voc_label_indices(cropped_mask, self.colormap2label)
        return (channels_first, class_indices)

    def __len__(self):
        """Return the number of examples kept after filtering."""
        return len(self.features)


# The printed shapes should show image tensors with a channel axis, but label
# tensors with only (batch, H, W). The label has no channel dimension because
# each pixel stores one class id.
def load_data_voc(batch_size, crop_size):
    """Load the VOC semantic segmentation dataset.

    Args:
        batch_size: Number of examples per batch for both loaders.
        crop_size: ``(height, width)`` forwarded to ``VOCSegDataset``.

    Returns:
        A ``(train_iter, test_iter)`` pair of ``gluon.data.DataLoader``
        objects. Only the training loader shuffles; both discard the last
        incomplete batch.
    """
    voc_subdir = os.path.join('VOCdevkit', 'VOC2012')
    voc_dir = d2l.download_extract('voc2012', voc_subdir)
    # Settings shared by the training and the test loader.
    shared = dict(last_batch='discard',
                  num_workers=d2l.get_dataloader_workers())

    train_set = VOCSegDataset(True, crop_size, voc_dir)
    train_iter = gluon.data.DataLoader(train_set, batch_size, shuffle=True,
                                       **shared)
    test_set = VOCSegDataset(False, crop_size, voc_dir)
    test_iter = gluon.data.DataLoader(test_set, batch_size, **shared)
    return train_iter, test_iter


# This is the (image, label) loader the next deck (FCN) trains on.