
Commit 0b2e1e3

Switch from gym to gymnasium and refresh RL code (#2864)
* Switch from gym to gymnasium and refresh RL code
* autopep8 fix

Co-authored-by: vfdev-5 <[email protected]>
1 parent 913b60f commit 0b2e1e3

4 files changed: +148, -92 lines changed
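The refreshed examples rely on gymnasium's environment API rather than the legacy gym one: `env.reset()` accepts a seed and returns `(observation, info)`, and `env.step()` returns a five-element tuple, which the diffs below unpack as `observation, reward, done, _, _ = env.step(action)` (treating `terminated` as the end-of-episode signal). A minimal sketch of that interaction loop; the environment id and seed are illustrative, not taken from this commit:

```python
import gymnasium as gym

# Illustrative environment id; the commit's argument parsing is not shown in the hunks below.
env = gym.make("CartPole-v1")

# gymnasium's reset() accepts a seed and returns (observation, info),
# matching the `env.reset(seed=...)` unpacking in the updated examples.
observation, info = env.reset(seed=0)

done = False
while not done:
    action = env.action_space.sample()  # stand-in for select_action(policy, observation)
    # step() returns (observation, reward, terminated, truncated, info);
    # the examples keep only the third element as their `done` flag.
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated

env.close()
```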

examples/reinforcement_learning/README.md
Lines changed: 1 addition & 1 deletion

````diff
@@ -3,7 +3,7 @@
 ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/reinforcement_learning)
 
 ```bash
-pip install gym
+pip install gymnasium
 # For REINFORCE:
 python reinforce.py
 # For actor critic:
````

examples/reinforcement_learning/actor_critic.py
Lines changed: 102 additions & 47 deletions

```diff
@@ -1,5 +1,5 @@
 import argparse
-from collections import namedtuple
+from collections import deque, namedtuple
 
 import numpy as np
 import torch
@@ -11,61 +11,110 @@
 from ignite.engine import Engine, Events
 
 try:
-    import gym
+    import gymnasium as gym
 except ImportError:
-    raise ModuleNotFoundError("Please install opengym: pip install gym")
+    raise ModuleNotFoundError("Please install opengym: pip install gymnasium")
 
 
 SavedAction = namedtuple("SavedAction", ["log_prob", "value"])
 
+eps = np.finfo(np.float32).eps.item()
+
 
 class Policy(nn.Module):
+    """
+    implements both actor and critic in one model
+    """
+
     def __init__(self):
         super(Policy, self).__init__()
         self.affine1 = nn.Linear(4, 128)
+
+        # actor's layer
         self.action_head = nn.Linear(128, 2)
+
+        # critic's layer
         self.value_head = nn.Linear(128, 1)
 
+        # action & reward buffer
         self.saved_actions = []
         self.rewards = []
 
     def forward(self, x):
+        """
+        forward of both actor and critic
+        """
         x = F.relu(self.affine1(x))
-        action_scores = self.action_head(x)
+
+        # actor: choses action to take from state s_t
+        # by returning probability of each action
+        action_prob = F.softmax(self.action_head(x), dim=-1)
+
+        # critic: evaluates being in the state s_t
         state_values = self.value_head(x)
-        return F.softmax(action_scores, dim=-1), state_values
+
+        # return values for both actor and critic as a tuple of 2 values:
+        # 1. a list with the probability of each action over the action space
+        # 2. the value from state s_t
+        return action_prob, state_values
 
 
-def select_action(model, observation):
+def select_action(policy, observation):
     observation = torch.from_numpy(observation).float()
-    probs, observation_value = model(observation)
+    probs, observation_value = policy(observation)
+    # create a categorical distribution over the list of probabilities of actions
     m = Categorical(probs)
+
+    # and sample an action using the distribution
     action = m.sample()
-    model.saved_actions.append(SavedAction(m.log_prob(action), observation_value))
+
+    # save to action buffer
+    policy.saved_actions.append(SavedAction(m.log_prob(action), observation_value))
+
+    # the action to take (left or right)
     return action.item()
 
 
-def finish_episode(model, optimizer, gamma, eps):
+def finish_episode(policy, optimizer, gamma):
+    """
+    Training code. Calculates actor and critic loss and performs backprop.
+    """
     R = 0
-    saved_actions = model.saved_actions
-    policy_losses = []
-    value_losses = []
-    rewards = []
-    for r in model.rewards[::-1]:
+    saved_actions = policy.saved_actions
+    policy_losses = []  # list to save actor (policy) loss
+    value_losses = []  # list to save critic (value) loss
+    returns = deque()  # list to save the true values
+
+    # calculate the true value using rewards returned from the environment
+    for r in policy.rewards[::-1]:
+        # calculate the discounted value
         R = r + gamma * R
-        rewards.insert(0, R)
-    rewards = torch.tensor(rewards)
-    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
-    for (log_prob, value), r in zip(saved_actions, rewards):
-        reward = r - value.item()
-        policy_losses.append(-log_prob * reward)
-        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))
+        returns.appendleft(R)
+
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + eps)
+
+    for (log_prob, value), R in zip(saved_actions, returns):
+        advantage = R - value.item()
+
+        # calculate actor (policy) loss
+        policy_losses.append(-log_prob * advantage)
+
+        # calculate critic (value) loss using L1 smooth loss
+        value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))
+
+    # reset gradients
     optimizer.zero_grad()
+
+    # sum up all the values of policy_losses and value_losses
     loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
+
+    # perform backprop
     loss.backward()
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_actions[:]
+    # reset rewards and action buffer
+    del policy.rewards[:]
+    del policy.saved_actions[:]
 
 
 EPISODE_STARTED = Events.EPOCH_STARTED
@@ -74,57 +123,63 @@ def finish_episode(model, optimizer, gamma, eps):
 
 def main(env, args):
 
-    model = Policy()
-    optimizer = optim.Adam(model.parameters(), lr=3e-2)
-    eps = np.finfo(np.float32).eps.item()
-    timesteps = list(range(10000))
+    policy = Policy()
+    optimizer = optim.Adam(policy.parameters(), lr=3e-2)
+    timesteps = range(10000)
 
     def run_single_timestep(engine, timestep):
         observation = engine.state.observation
-        action = select_action(model, observation)
+        # select action from policy
+        action = select_action(policy, observation)
+
+        # take the action
         engine.state.observation, reward, done, _, _ = env.step(action)
+
         if args.render:
             env.render()
-        model.rewards.append(reward)
 
+        policy.rewards.append(reward)
+        engine.state.ep_reward += reward
         if done:
             engine.terminate_epoch()
             engine.state.timestep = timestep
 
     trainer = Engine(run_single_timestep)
-
-    @trainer.on(Events.STARTED)
-    def initialize(engine):
-        engine.state.running_reward = 10
+    trainer.state.running_reward = 10
 
     @trainer.on(EPISODE_STARTED)
-    def reset_environment_state(engine):
+    def reset_environment_state():
+        # reset environment and episode reward
         torch.manual_seed(args.seed + trainer.state.epoch)
-        engine.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.ep_reward = 0
 
     @trainer.on(EPISODE_COMPLETED)
-    def update_model(engine):
-        t = engine.state.timestep
-        engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
-        finish_episode(model, optimizer, args.gamma, eps)
+    def update_model():
+        # update cumulative reward
+        t = trainer.state.timestep
+        trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
+        # perform backprop
+        finish_episode(policy, optimizer, args.gamma)
 
     @trainer.on(EPISODE_COMPLETED(every=args.log_interval))
-    def log_episode(engine):
-        i_episode = engine.state.epoch
+    def log_episode():
+        i_episode = trainer.state.epoch
         print(
-            f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
-            f"\tAverage length: {engine.state.running_reward:.2f}"
+            f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
+            f"\tAverage reward: {trainer.state.running_reward:.2f}"
        )
 
     @trainer.on(EPISODE_COMPLETED)
-    def should_finish_training(engine):
-        running_reward = engine.state.running_reward
+    def should_finish_training():
+        # check if we have "solved" the cart pole problem
+        running_reward = trainer.state.running_reward
        if running_reward > env.spec.reward_threshold:
            print(
                 f"Solved! Running reward is now {running_reward} and "
-                f"the last episode runs to {engine.state.timestep} time steps!"
+                f"the last episode runs to {trainer.state.timestep} time steps!"
            )
-            engine.should_terminate = True
+            trainer.should_terminate = True
 
     trainer.run(timesteps, max_epochs=args.max_episodes)
```
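The reworked `finish_episode` builds normalized discounted returns with a `deque` and uses `R - value` as the advantage for the actor loss. The return computation can be exercised on its own; a small self-contained sketch using the same discounting and eps-normalization as the code above (the reward sequence and gamma value are made up for illustration):

```python
from collections import deque

import numpy as np
import torch

eps = np.finfo(np.float32).eps.item()


def discounted_returns(rewards, gamma=0.99):
    # Walk the episode backwards, accumulating R = r + gamma * R,
    # and appendleft() so the returns come out in chronological order.
    R = 0.0
    returns = deque()
    for r in rewards[::-1]:
        R = r + gamma * R
        returns.appendleft(R)
    returns = torch.tensor(returns)
    # Normalize to zero mean / unit variance; eps guards against a zero std.
    return (returns - returns.mean()) / (returns.std() + eps)


# CartPole-style rewards of 1.0 per step: earlier steps get larger returns.
print(discounted_returns([1.0, 1.0, 1.0, 1.0]))
```

In actor_critic.py each normalized return R is then compared with the critic's value estimate to form the advantage that weights `-log_prob`, while the critic itself is fit to R with `smooth_l1_loss`.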

examples/reinforcement_learning/reinforce.py
Lines changed: 44 additions & 41 deletions

```diff
@@ -1,4 +1,5 @@
 import argparse
+from collections import deque
 
 import numpy as np
 import torch
@@ -10,52 +11,58 @@
 from ignite.engine import Engine, Events
 
 try:
-    import gym
+    import gymnasium as gym
 except ImportError:
-    raise ModuleNotFoundError("Please install opengym: pip install gym")
+    raise ModuleNotFoundError("Please install opengym: pip install gymnasium")
+
+
+eps = np.finfo(np.float32).eps.item()
 
 
 class Policy(nn.Module):
     def __init__(self):
         super(Policy, self).__init__()
         self.affine1 = nn.Linear(4, 128)
+        self.dropout = nn.Dropout(p=0.6)
         self.affine2 = nn.Linear(128, 2)
 
         self.saved_log_probs = []
         self.rewards = []
 
     def forward(self, x):
-        x = F.relu(self.affine1(x))
+        x = self.affine1(x)
+        x = self.dropout(x)
+        x = F.relu(x)
         action_scores = self.affine2(x)
         return F.softmax(action_scores, dim=1)
 
 
-def select_action(model, observation):
+def select_action(policy, observation):
     state = torch.from_numpy(observation).float().unsqueeze(0)
-    probs = model(state)
+    probs = policy(state)
     m = Categorical(probs)
     action = m.sample()
-    model.saved_log_probs.append(m.log_prob(action))
+    policy.saved_log_probs.append(m.log_prob(action))
     return action.item()
 
 
-def finish_episode(model, optimizer, gamma, eps):
+def finish_episode(policy, optimizer, gamma):
     R = 0
     policy_loss = []
-    rewards = []
-    for r in model.rewards[::-1]:
+    returns = deque()
+    for r in policy.rewards[::-1]:
         R = r + gamma * R
-        rewards.insert(0, R)
-    rewards = torch.tensor(rewards)
-    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
-    for log_prob, reward in zip(model.saved_log_probs, rewards):
-        policy_loss.append(-log_prob * reward)
+        returns.appendleft(R)
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + eps)
+    for log_prob, R in zip(policy.saved_log_probs, returns):
+        policy_loss.append(-log_prob * R)
     optimizer.zero_grad()
     policy_loss = torch.cat(policy_loss).sum()
     policy_loss.backward()
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_log_probs[:]
+    del policy.rewards[:]
+    del policy.saved_log_probs[:]
 
 
 EPISODE_STARTED = Events.EPOCH_STARTED
@@ -64,57 +71,53 @@ def finish_episode(model, optimizer, gamma, eps):
 
 def main(env, args):
 
-    model = Policy()
-    optimizer = optim.Adam(model.parameters(), lr=1e-2)
-    eps = np.finfo(np.float32).eps.item()
-    timesteps = list(range(10000))
+    policy = Policy()
+    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
+    timesteps = range(10000)
 
     def run_single_timestep(engine, timestep):
         observation = engine.state.observation
-        action = select_action(model, observation)
+        action = select_action(policy, observation)
         engine.state.observation, reward, done, _, _ = env.step(action)
         if args.render:
             env.render()
-        model.rewards.append(reward)
-
+        policy.rewards.append(reward)
+        engine.state.ep_reward += reward
         if done:
             engine.terminate_epoch()
             engine.state.timestep = timestep
 
     trainer = Engine(run_single_timestep)
-
-    @trainer.on(Events.STARTED)
-    def initialize(engine):
-        engine.state.running_reward = 10
+    trainer.state.running_reward = 10
 
     @trainer.on(EPISODE_STARTED)
-    def reset_environment_state(engine):
+    def reset_environment_state():
         torch.manual_seed(args.seed + trainer.state.epoch)
-        engine.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.ep_reward = 0
 
     @trainer.on(EPISODE_COMPLETED)
-    def update_model(engine):
-        t = engine.state.timestep
-        engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
-        finish_episode(model, optimizer, args.gamma, eps)
+    def update_model():
+        trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
+        finish_episode(policy, optimizer, args.gamma)
 
     @trainer.on(EPISODE_COMPLETED(every=args.log_interval))
-    def log_episode(engine):
-        i_episode = engine.state.epoch
+    def log_episode():
+        i_episode = trainer.state.epoch
         print(
-            f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
-            f"\tAverage length: {engine.state.running_reward:.2f}"
+            f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
+            f"\tAverage length: {trainer.state.running_reward:.2f}"
        )
 
     @trainer.on(EPISODE_COMPLETED)
-    def should_finish_training(engine):
-        running_reward = engine.state.running_reward
+    def should_finish_training():
+        running_reward = trainer.state.running_reward
        if running_reward > env.spec.reward_threshold:
            print(
                 f"Solved! Running reward is now {running_reward} and "
-                f"the last episode runs to {engine.state.timestep} time steps!"
+                f"the last episode runs to {trainer.state.timestep} time steps!"
            )
-            engine.should_terminate = True
+            trainer.should_terminate = True
 
     trainer.run(timesteps, max_epochs=args.max_episodes)
```