import time
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from d2l import tensorflow as d2l
import keras
import numpy as np
from scipy import stats
HPO algorithms share a common structure. The next two decks will swap out individual pieces (parallel scheduling, multi-fidelity), so this deck factors out the common skeleton:
This is the same shape every modern HPO library uses (Optuna, Syne Tune, Vizier, Ray Tune).
A concrete RandomSearcher:
class RandomSearcher(HPOSearcher):
    """Searcher that proposes configurations by independent random sampling.

    If an ``initial_config`` is supplied it is handed out exactly once, on
    the first call to ``sample_configuration``. Every later call draws a
    fresh value for each hyperparameter from its entry in ``config_space``.
    """

    def __init__(self, config_space: dict, initial_config=None):
        # The method below reads both arguments back as ``self.config_space``
        # and ``self.initial_config``.
        self.save_hyperparameters()

    def sample_configuration(self) -> dict:
        """Return the next configuration to evaluate.

        Returns:
            The pending ``initial_config`` if one is still set, otherwise a
            dict mapping each hyperparameter name to ``domain.rvs()``.
        """
        if self.initial_config is not None:
            # Consume the user-provided starting point so it is used once.
            first, self.initial_config = self.initial_config, None
            return first
        # Each domain only needs to expose an ``rvs()`` method.
        return {hp: dist.rvs() for hp, dist in self.config_space.items()}
# Concrete sequential / FIFO scheduler:
The tuner combines the searcher, the scheduler, and the objective into a single loop:
class HPOTuner(d2l.HyperParameters):
    """Run a sequential HPO loop: suggest, evaluate, report, record.

    The scheduler supplies configurations through ``suggest()`` and receives
    results through ``update()``. The objective is called with the
    configuration expanded as keyword arguments.
    """

    def __init__(self, scheduler: HPOScheduler, objective: callable):
        self.save_hyperparameters()
        # Bookkeeping results for plotting
        self.records = []
        self.incumbent_trajectory = []
        self.cumulative_runtime = []
        self.current_runtime = 0
        self.incumbent = None
        self.incumbent_error = None

    def run(self, number_of_trials):
        """Evaluate ``number_of_trials`` configurations one after another."""
        for i in range(number_of_trials):
            tic = time.time()
            config = self.scheduler.suggest()
            print(f"Trial {i}: config = {config}")
            # Cast so that whatever scalar the objective returns is stored
            # and reported as a plain Python float.
            error = float(self.objective(**config))
            self.scheduler.update(config, error)
            # Measured runtime covers suggest, evaluation and update.
            runtime = time.time() - tic
            self.bookkeeping(config, error, runtime)
            print(f" error = {error}, runtime = {runtime}")
# Track wall-clock time and best-seen objective so we can plot any-time
# performance later:
@d2l.add_to_class(HPOTuner)
def bookkeeping(self, config: dict, error: float, runtime: float):
    """Record one finished trial and refresh the incumbent statistics.

    Args:
        config: Hyperparameter configuration that was evaluated.
        error: Objective value observed for ``config`` (lower is better).
        runtime: Time spent on this trial, added to the running total.
    """
    # Imported locally so this block does not rely on a module-level import.
    import math

    self.records.append({"config": config, "error": error, "runtime": runtime})
    # Check if the last hyperparameter configuration performs better
    # than the incumbent
    if self.incumbent is None:
        improved = True
    elif math.isnan(self.incumbent_error):
        # Every ordering comparison against NaN is False, so a NaN incumbent
        # (e.g. from a failed first trial) could never be beaten by a plain
        # ``>`` test. Replace it as soon as a non-NaN error arrives.
        improved = not math.isnan(error)
    else:
        # A NaN ``error`` compares False here and never becomes incumbent.
        improved = error < self.incumbent_error
    if improved:
        self.incumbent = config
        self.incumbent_error = error
    # Add current best observed performance to the optimization trajectory
    self.incumbent_trajectory.append(self.incumbent_error)
    # Update runtime
    self.current_runtime += runtime
    self.cumulative_runtime.append(self.current_runtime)
# Run the abstraction on a real model — a small CNN on Fashion-MNIST. Search
# over learning rate, batch size, and network width:
def hpo_objective_lenet(learning_rate, batch_size, max_epochs=10):
    """Train a small CNN on Fashion-MNIST and return its validation error.

    Args:
        learning_rate: Step size passed to the SGD optimizer.
        batch_size: Batch size passed to ``d2l.FashionMNIST``.
        max_epochs: Number of epochs passed to ``model.fit``.

    Returns:
        ``1 - val_accuracy`` of the last epoch.
    """
    import keras

    layers = keras.layers
    architecture = [
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(6, kernel_size=5, padding='same', activation='relu'),
        layers.MaxPooling2D(pool_size=2, strides=2),
        layers.Conv2D(16, kernel_size=5, activation='relu'),
        layers.MaxPooling2D(pool_size=2, strides=2),
        layers.Flatten(),
        layers.Dense(120, activation='relu'),
        layers.Dense(84, activation='relu'),
        # Final layer emits raw logits; the loss below is told so.
        layers.Dense(10),
    ]
    model = keras.Sequential(architecture)

    sgd = keras.optimizers.SGD(learning_rate=learning_rate)
    cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=sgd, loss=cross_entropy, metrics=['accuracy'])

    data = d2l.FashionMNIST(batch_size=batch_size)
    train_ds = data.get_dataloader(True)
    val_ds = data.get_dataloader(False)

    history = model.fit(
        train_ds,
        epochs=max_epochs,
        validation_data=val_ds,
        verbose=0,
    )
    # Only the last epoch's validation accuracy counts towards the objective.
    final_val_acc = history.history['val_accuracy'][-1]
    return 1 - final_val_acc
# The incumbent curve reports the best validation error found so far as the
# tuner spends more wall-clock time. Downward steps mean a new configuration
# beat the previous best; flat regions mean the search is still evaluating but
# has not improved the incumbent.
Trial 0: config = {'learning_rate': 0.1, 'batch_size': 128}
error = 0.1128000020980835, runtime = 75.96968817710876
Trial 1: config = {'learning_rate': np.float64(0.021230471140226553), 'batch_size': 153}
error = 0.13779997825622559, runtime = 83.06359434127808
Trial 2: config = {'learning_rate': np.float64(0.06615695165442259), 'batch_size': 60}
error = 0.10479998588562012, runtime = 113.53853964805603
Trial 3: config = {'learning_rate': np.float64(0.21510876313424573), 'batch_size': 103}
error = 0.11309999227523804, runtime = 83.84592032432556
Trial 4: config = {'learning_rate': np.float64(0.033130508515860586), 'batch_size': 176}
error = 0.15079998970031738, runtime = 140.96908688545227