try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    !pip install -U gymnasium pygame moviepy
    !pip install gymnasium[box2d]
Monte-Carlo control
import numpy as np
rng = np.random.default_rng()
import matplotlib.pyplot as plt
import os

import gymnasium as gym
print("gym version:", gym.__version__)

from moviepy.editor import ImageSequenceClip, ipython_display

class GymRecorder(object):
    """
    Simple wrapper over moviepy to generate a .gif with the frames of a gym environment.
    The environment must have the render_mode `rgb_array_list`.
    """
    def __init__(self, env):
        self.env = env
        self._frames = []

    def record(self, frames):
        "To be called at the end of an episode."
        for frame in frames:
            self._frames.append(np.array(frame))

    def make_video(self, filename):
        "Generates the gif video."
        directory = os.path.dirname(os.path.abspath(filename))
        if not os.path.exists(directory):
            os.mkdir(directory)
        self.clip = ImageSequenceClip(list(self._frames), fps=self.env.metadata["render_fps"])
        self.clip.write_gif(filename, fps=self.env.metadata["render_fps"], loop=0)
        del self._frames
        self._frames = []
gym version: 0.26.3
The taxi environment
In this exercise, we are going to apply on-policy Monte-Carlo control on the Taxi environment available in gym:
https://gymnasium.farama.org/environments/toy_text/taxi/
Let’s create the environment in ansi mode, initialize it, and render the first state:
env = gym.make("Taxi-v3", render_mode='ansi')
state, info = env.reset()
print(state)
print(env.render())
374
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
The agent is the black square. It can move up, down, left or right if there is no wall (the pipes and dashes). Its goal is to pick up clients at the blue location and drop them off at the purple location. These locations are fixed (R, G, B, Y), but which one is the pick-up location and which one is the drop-off destination changes between episodes.
Q: Re-run the previous cell multiple times to observe the diversity of initial states.
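If you prefer doing this in a loop, here is a minimal sketch (assuming the env created above):

# Reset the environment a few times and render the resulting initial states:
# the taxi position, the pick-up location and the destination should all vary.
for _ in range(3):
    state, info = env.reset()
    print("Initial state:", state)
    print(env.render())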
The following cell prints the action space of the environment:
print("Action Space:", env.action_space)
print("Number of actions:", env.action_space.n)
Action Space: Discrete(6)
Number of actions: 6
There are 6 discrete actions: south, north, east, west, pickup, dropoff.
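As a quick illustration, the sketch below steps the environment once with a named action (the index-to-name mapping follows the Taxi-v3 documentation):

# Index-to-name mapping of the 6 discrete actions (from the Taxi-v3 documentation).
action_names = ["south", "north", "east", "west", "pickup", "dropoff"]

state, info = env.reset()
next_state, reward, terminal, truncated, info = env.step(1)  # move north
print("Action:", action_names[1], "- reward:", reward)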
Let’s now look at the observation space (state space):
print("State Space:", env.observation_space)
print("Number of states:", env.observation_space.n)
State Space: Discrete(500)
Number of states: 500
There are 500 discrete states. What are they?
- The taxi can be anywhere in the 5x5 grid, giving 25 different locations.
- The passenger can be at any of the four locations R, G, B, Y or in the taxi: 5 values.
- The destination can be any of the four locations: 4 values.
This indeed gives 25x5x4 = 500 different combinations.
The internal representation of a state is a number between 0 and 499. You can use the encode and decode methods of the environment to relate it to the state variables.
state = env.encode(2, 1, 1, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

state = env.decode(328)
print("State:", list(state))
State: 224
State: [3, 1, 2, 0]
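Under the hood, the index is simply a mixed-radix number built from the four state variables. The manual_encode helper below is a hypothetical sketch reproducing env.encode(), assuming the standard encoding of Taxi-v3:

# Hypothetical helper: mixed-radix encoding for
# (taxi row, taxi column, passenger index, destination index).
def manual_encode(row, col, passenger, destination):
    return ((row * 5 + col) * 5 + passenger) * 4 + destination

print(manual_encode(2, 1, 1, 0)) # 224, the same value as env.encode(2, 1, 1, 0) above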
The reward function is simple:
- r = 20 when delivering the client at the correct location.
- r = -10 when picking up or dropping off a client illegally (picking up where there is no client, dropping off the client at the wrong location, etc.)
- r = -1 for all other transitions, in order to incentivize the agent to be as fast as possible.
The actions pickup and dropoff are very dangerous: take them at the wrong time and your return will be very low. The navigation actions are less critical.
Depending on the initial state, the taxi will need at least 10 steps to deliver the client, so the maximal return you can expect is around 10 (+20 for the successful delivery, -1 for each of the roughly 10 steps).
The task is episodic: if you have not delivered the client within 200 steps, the episode stops (no particular reward).
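To see the penalty in action, here is a hedged sanity check (reusing the ansi environment created earlier; the initial taxi position is random, so an immediate pickup is illegal unless the taxi happens to spawn on the client):

# Take the pickup action (index 4) right after a reset. Unless the taxi starts
# on the client's cell, this is an illegal pickup and should yield r = -10.
state, info = env.reset()
next_state, reward, terminal, truncated, info = env.step(4)
print("Reward after an immediate pickup:", reward)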
Random agent
Let’s now define a random agent that just samples the action space.
Q: Modify the random agent from last time so that it accepts the GymRecorder that generates the .gif file:
def train(self, nb_episodes, recorder=None):
The environment should be started in ‘rgb_array_list’ mode, not ‘ansi’. The game looks different but has the same rules.
env = gym.make("Taxi-v3", render_mode='rgb_array_list')
recorder = GymRecorder(env)
As episodes in Taxi can be quite long, only the last episode should be recorded:
if recorder is not None and episode == nb_episodes - 1:
    recorder.record(self.env.render())
Perform 10 episodes, plot the obtained returns and visualize the last episode.
class RandomAgent:
    """
    Random agent exploring uniformly the environment.
    """
    def __init__(self, env):
        self.env = env

    def act(self, state):
        "Returns a random action by sampling the action space."
        return self.env.action_space.sample()

    def update(self, state, action, reward, next_state):
        "Updates the agent using the transition (s, a, r, s')."
        pass

    def train(self, nb_episodes, recorder=None):
        "Runs the agent on the environment for nb_episodes. Returns the list of episodic returns."

        # List of returns
        returns = []

        for episode in range(nb_episodes):

            # Sample the initial state
            state, info = self.env.reset()

            return_episode = 0.0
            done = False
            while not done:

                # Select an action randomly
                action = self.env.action_space.sample()

                # Sample a single transition
                next_state, reward, terminal, truncated, info = self.env.step(action)

                # Go to the next state
                state = next_state

                # Update the return
                return_episode += reward

                # End of the episode
                done = terminal or truncated

            # Record the frames at the end of the last episode
            if recorder is not None and episode == nb_episodes - 1:
                recorder.record(self.env.render())

            # Append the return
            returns.append(return_episode)

        return returns
# Create the environment
env = gym.make("Taxi-v3", render_mode='rgb_array_list')
recorder = GymRecorder(env)

# Create the agent
agent = RandomAgent(env)

# Train for 10 episodes
returns = agent.train(10, recorder)

plt.figure(figsize=(15, 6))
plt.plot(returns)
plt.xlabel("Episodes")
plt.ylabel("Return")
plt.show()

video = "videos/taxi.gif"
recorder.make_video(video)
ipython_display(video)
MoviePy - Building file videos/taxi.gif with imageio.