def q_learning(env_info, gamma, num_iters, alpha, epsilon_start, epsilon_end):
    """Run tabular Q-learning and plot how the greedy value/policy evolve.

    One episode is run per iteration. Actions are chosen by
    ``e_greedy(env, Q, state, epsilon)``, where ``epsilon`` is linearly
    interpolated from ``epsilon_start`` (first episode) to ``epsilon_end``
    (last episode). With a single episode, ``epsilon_start`` is used.

    Args:
        env_info: Mapping with the keys ``'desc'`` (grid description that is
            forwarded to the plotting helper), ``'env'`` (environment whose
            ``reset()`` returns ``(state, info)`` and whose ``step(action)``
            returns ``(next_state, reward, terminated, truncated, info)``),
            ``'num_states'`` and ``'num_actions'``.
        gamma: Discount factor applied to the bootstrapped next-state value.
        num_iters: Number of episodes to run.
        alpha: Step size of the Q-value update.
        epsilon_start: Exploration rate for the first episode.
        epsilon_end: Exploration rate for the last episode.

    Returns:
        None. The per-episode greedy values and actions are handed to
        ``d2l.show_Q_function_progress`` for visualization.
    """
    env_desc = env_info['desc']  # grid description, used only for plotting
    env = env_info['env']  # environment object providing reset() and step()
    num_states = env_info['num_states']
    num_actions = env_info['num_actions']

    # States and actions index directly into Q, so they are assumed to be
    # integer ids in [0, num_states) and [0, num_actions).
    Q = np.zeros((num_states, num_actions))
    # Row k holds the snapshot taken after episode k; row 0 stays all-zero.
    V = np.zeros((num_iters + 1, num_states))
    pi = np.zeros((num_iters + 1, num_states))

    # Guard against division by zero when only one episode is requested.
    anneal_steps = max(1, num_iters - 1)

    for k in range(1, num_iters + 1):
        # Linearly anneal epsilon over episodes.
        epsilon = epsilon_start + (epsilon_end - epsilon_start) * (
            (k - 1) / anneal_steps)

        state, _ = env.reset()
        done = False
        while not done:
            # Pick an action for the current state and act in the environment.
            action = e_greedy(env, Q, state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Q-update: drop the bootstrap term only at true terminal states.
            # A truncated episode (e.g. a time limit) ends the rollout but
            # next_state is not terminal, so its value must still be
            # bootstrapped or the target is biased towards the raw reward.
            if terminated:
                y = reward
            else:
                y = reward + gamma * np.max(Q[next_state, :])
            Q[state, action] += alpha * (y - Q[state, action])

            state = next_state

        # Record greedy value and greedy action per state (visualization only).
        V[k] = Q.max(axis=1)
        pi[k] = Q.argmax(axis=1)

    d2l.show_Q_function_progress(env_desc, V[:-1], pi[:-1])
# Train and visualize; env_info and the hyperparameters below are expected to
# be defined earlier in the file.
q_learning(
    env_info=env_info,
    gamma=gamma,
    num_iters=num_iters,
    alpha=alpha,
    epsilon_start=epsilon_start,
    epsilon_end=epsilon_end,
)