## 3. States

`State = [x_pos_taxi, y_pos_taxi, pos_passenger, dest_passenger]`

## 6. Rewards

• `移动：-1`, 表示每一步都会受到一点惩罚，以鼓励从出发地到目的地走最短的路。
• `错误运送：-10`， 表示当乘客被送到错误的位置时，乘客自然会不高兴，所以惩罚大一些是合适的。
• `成功送达：20`，表示出租车司机成功完成了任务，鼓励相应的行为，因此产生了正向的`reward`。

## 7. Initialization

``````"""install libraries"""
!pip install cmake 'gym[atari]' scipy pygame

"""Import libraries"""
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
from time import sleep
from matplotlib import animation``````

``````"""Initialize and validate the environment"""
env = gym.make("Taxi-v3", render_mode="rgb_array").env
state, _ = env.reset()

# Print dimensions of state and action space
print("State space: {}".format(env.observation_space))
print("Action space: {}".format(env.action_space))

# Sample random action
next_state, reward, done, _, _ = env.step(action)

# Print output
print("State: {}".format(state))
print("Action: {}".format(action))
print("Reward: {}".format(reward))

# Render and plot an environment frame
frame = env.render()
plt.imshow(frame)
plt.axis("off")
plt.show()``````

## 8. 测试随机`agent`

``````"""Simulation with random agent"""
epoch = 0
num_failed_dropoffs = 0
experience_buffer = []
cum_reward = 0

done = False

state, _ = env.reset()

while not done:
# Sample random action
action = env.action_space.sample()

state, reward, done, _, _ = env.step(action)
cum_reward += reward

# Store experience in dictionary
experience_buffer.append({
"frame": env.render(),
"episode": 1,
"epoch": epoch,
"state": state,
"action": action,
"reward": cum_reward,
}
)

if reward == -10:
num_failed_dropoffs += 1

epoch += 1

# Run animation and print console output
run_animation(experience_buffer)

print("# epochs: {}".format(epoch))
print("# failed drop-offs: {}".format(num_failed_dropoffs))``````

## 9. 训练`agent`

``````"""Training the agent"""
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 1.0  # Discount rate
epsilon = 0.1  # Exploration rate
num_episodes = 10000  # Number of episodes

# Output for plots
cum_rewards = np.zeros([num_episodes])
total_epochs = np.zeros([num_episodes])

for episode in range(1, num_episodes+1):
# Reset environment
state, info = env.reset()
epoch = 0
num_failed_dropoffs = 0
done = False
cum_reward = 0

while not done:

if random.uniform(0, 1) < epsilon:
"Basic exploration [~0.47m]"
action = env.action_space.sample() # Sample random action (exploration)

else:

"Exploitation with random tie breaker [~1m19s]"
#  action = np.random.choice(np.flatnonzero(q_table[state] == q_table[state].max()))

"Basic exploitation [~47s]"
action = np.argmax(q_table[state]) # Select best known action (exploitation)

next_state, reward, done, _ , info = env.step(action)

cum_reward += reward

old_q_value = q_table[state, action]
next_max = np.max(q_table[next_state])

new_q_value = (1 - alpha) * old_q_value + alpha * (reward + gamma * next_max)

q_table[state, action] = new_q_value

if reward == -10:
num_failed_dropoffs += 1

state = next_state
epoch += 1

total_epochs[episode-1] = epoch
cum_rewards[episode-1] = cum_reward

if episode % 100 == 0:
clear_output(wait=True)
print(f"Episode #: {episode}")

print("\n")
print("===Training completed.===\n")

# Plot reward convergence
plt.title("Cumulative reward per episode")
plt.xlabel("Episode")
plt.ylabel("Cumulative reward")
plt.plot(cum_rewards)
plt.show()

# Plot epoch convergence
plt.title("# epochs per episode")
plt.xlabel("Episode")
plt.ylabel("# epochs")
plt.plot(total_epochs)
plt.show()``````

## 10. 验证训练效果

``````"""Test policy performance after training"""

num_epochs = 0
total_failed_deliveries = 0
num_episodes = 1
experience_buffer = []
store_gif = True

for episode in range(1, num_episodes+1):
# Initialize experience buffer

my_env = env.reset()
state = my_env[0]
epoch = 1
num_failed_deliveries =0
cum_reward = 0
done = False

while not done:
action = np.argmax(q_table[state])
state, reward, done, _, _ = env.step(action)
cum_reward += reward

if reward == -10:
num_failed_deliveries += 1

# Store rendered frame in animation dictionary
experience_buffer.append({
'frame': env.render(),
'episode': episode,
'epoch': epoch,
'state': state,
'action': action,
'reward': cum_reward
}
)

epoch += 1

total_failed_deliveries += num_failed_deliveries
num_epochs += epoch

if store_gif:
store_episode_as_gif(experience_buffer)

# Run animation and print output
run_animation(experience_buffer)

# Print final results
print("\n")
print(f"Test results after {num_episodes} episodes:")
print(f"Mean # epochs per episode: {num_epochs / num_episodes}")
print(f"Mean # failed drop-offs per episode: {total_failed_deliveries / num_episodes}")``````