 import argparse
-from collections import namedtuple
+from collections import deque, namedtuple

 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 from torch.distributions import Categorical

 from ignite.engine import Engine, Events

 try:
-    import gym
+    import gymnasium as gym
 except ImportError:
-    raise ModuleNotFoundError("Please install opengym: pip install gym")
+    raise ModuleNotFoundError("Please install gymnasium: pip install gymnasium")


 SavedAction = namedtuple("SavedAction", ["log_prob", "value"])
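+# each SavedAction pairs the log-probability of the sampled action with the critic's
+# value estimate for that state; both are consumed later in finish_episode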

+eps = np.finfo(np.float32).eps.item()
+

 class Policy(nn.Module):
+    """
+    implements both actor and critic in one model
+    """
+
     def __init__(self):
         super(Policy, self).__init__()
         self.affine1 = nn.Linear(4, 128)
+
+        # actor's layer
         self.action_head = nn.Linear(128, 2)
+
+        # critic's layer
         self.value_head = nn.Linear(128, 1)

+        # action & reward buffer
         self.saved_actions = []
         self.rewards = []

     def forward(self, x):
+        """
+        forward of both actor and critic
+        """
         x = F.relu(self.affine1(x))
-        action_scores = self.action_head(x)
+
+        # actor: chooses the action to take from state s_t
+        # by returning the probability of each action
+        action_prob = F.softmax(self.action_head(x), dim=-1)
+
+        # critic: evaluates being in the state s_t
         state_values = self.value_head(x)
-        return F.softmax(action_scores, dim=-1), state_values
+
+        # return values for both actor and critic as a tuple of 2 values:
+        # 1. a list with the probability of each action over the action space
+        # 2. the value from state s_t
+        return action_prob, state_values


-def select_action(model, observation):
+def select_action(policy, observation):
     observation = torch.from_numpy(observation).float()
-    probs, observation_value = model(observation)
+    probs, observation_value = policy(observation)
+    # create a categorical distribution over the list of probabilities of actions
     m = Categorical(probs)
+
+    # and sample an action using the distribution
     action = m.sample()
-    model.saved_actions.append(SavedAction(m.log_prob(action), observation_value))
+
+    # save to action buffer
+    policy.saved_actions.append(SavedAction(m.log_prob(action), observation_value))
+
+    # the action to take (left or right)
     return action.item()


-def finish_episode(model, optimizer, gamma, eps):
+def finish_episode(policy, optimizer, gamma):
+    """
+    Training code. Calculates actor and critic loss and performs backprop.
+    """
     R = 0
-    saved_actions = model.saved_actions
-    policy_losses = []
-    value_losses = []
-    rewards = []
-    for r in model.rewards[::-1]:
+    saved_actions = policy.saved_actions
+    policy_losses = []  # list to save actor (policy) loss
+    value_losses = []  # list to save critic (value) loss
+    returns = deque()  # buffer to save the true values (discounted returns)
+
+    # calculate the true value using rewards returned from the environment
+    for r in policy.rewards[::-1]:
+        # calculate the discounted value
         R = r + gamma * R
-        rewards.insert(0, R)
-    rewards = torch.tensor(rewards)
-    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
-    for (log_prob, value), r in zip(saved_actions, rewards):
-        reward = r - value.item()
-        policy_losses.append(-log_prob * reward)
-        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))
+        returns.appendleft(R)
+
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + eps)
+
+    for (log_prob, value), R in zip(saved_actions, returns):
+        advantage = R - value.item()
+
+        # calculate actor (policy) loss
+        policy_losses.append(-log_prob * advantage)
+
+        # calculate critic (value) loss using smooth L1 loss
+        value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))
+
+    # reset gradients
     optimizer.zero_grad()
+
+    # sum up all the values of policy_losses and value_losses
     loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
+
+    # perform backprop
     loss.backward()
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_actions[:]
+    # reset rewards and action buffer
+    del policy.rewards[:]
+    del policy.saved_actions[:]

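+# Note: finish_episode computes the discounted return G_t = r_t + gamma * G_{t+1} by
+# iterating over the episode rewards in reverse, normalizes the returns to zero mean
+# and unit variance, and uses the advantage A_t = G_t - V(s_t) for the actor loss and
+# a smooth L1 loss between V(s_t) and G_t for the critic.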

 EPISODE_STARTED = Events.EPOCH_STARTED
 EPISODE_COMPLETED = Events.EPOCH_COMPLETED


 def main(env, args):

-    model = Policy()
-    optimizer = optim.Adam(model.parameters(), lr=3e-2)
-    eps = np.finfo(np.float32).eps.item()
-    timesteps = list(range(10000))
+    policy = Policy()
+    optimizer = optim.Adam(policy.parameters(), lr=3e-2)
+    timesteps = range(10000)

     def run_single_timestep(engine, timestep):
         observation = engine.state.observation
-        action = select_action(model, observation)
+        # select action from policy
+        action = select_action(policy, observation)
+
+        # take the action; gymnasium's env.step returns
+        # (observation, reward, terminated, truncated, info), so the terminated flag
+        # is used as `done` here and truncated is ignored
         engine.state.observation, reward, done, _, _ = env.step(action)
+
         if args.render:
             env.render()
-        model.rewards.append(reward)

+        policy.rewards.append(reward)
+        engine.state.ep_reward += reward
         if done:
             engine.terminate_epoch()
             engine.state.timestep = timestep

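+    # run_single_timestep is the Engine's process function: Ignite calls it once per
+    # item of `timesteps`, so each Ignite iteration is one environment step and each
+    # epoch (ended early via terminate_epoch) is one episode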
     trainer = Engine(run_single_timestep)
-
-    @trainer.on(Events.STARTED)
-    def initialize(engine):
-        engine.state.running_reward = 10
+    trainer.state.running_reward = 10
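+    # running_reward is an exponential moving average of the episode reward; it is
+    # compared against env.spec.reward_threshold below to decide when to stop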

     @trainer.on(EPISODE_STARTED)
-    def reset_environment_state(engine):
+    def reset_environment_state():
+        # reset environment and episode reward
         torch.manual_seed(args.seed + trainer.state.epoch)
-        engine.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
+        trainer.state.ep_reward = 0

     @trainer.on(EPISODE_COMPLETED)
-    def update_model(engine):
-        t = engine.state.timestep
-        engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
-        finish_episode(model, optimizer, args.gamma, eps)
+    def update_model():
+        # update cumulative reward
+        t = trainer.state.timestep
+        trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
+        # perform backprop
+        finish_episode(policy, optimizer, args.gamma)

     @trainer.on(EPISODE_COMPLETED(every=args.log_interval))
-    def log_episode(engine):
-        i_episode = engine.state.epoch
+    def log_episode():
+        i_episode = trainer.state.epoch
         print(
-            f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
-            f"\tAverage length: {engine.state.running_reward:.2f}"
+            f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
+            f"\tAverage reward: {trainer.state.running_reward:.2f}"
         )

     @trainer.on(EPISODE_COMPLETED)
-    def should_finish_training(engine):
-        running_reward = engine.state.running_reward
+    def should_finish_training():
+        # check if we have "solved" the cart pole problem
+        running_reward = trainer.state.running_reward
         if running_reward > env.spec.reward_threshold:
             print(
                 f"Solved! Running reward is now {running_reward} and "
-                f"the last episode runs to {engine.state.timestep} time steps!"
+                f"the last episode runs to {trainer.state.timestep} time steps!"
             )
-            engine.should_terminate = True
+            trainer.should_terminate = True

     trainer.run(timesteps, max_epochs=args.max_episodes)

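+# A minimal sketch of how this `main` could be driven. The actual argparse setup and
+# entry point live elsewhere in this script; the flag names, default values, and the
+# "CartPole-v1" environment id below are assumptions for illustration only:
+#
+#     if __name__ == "__main__":
+#         parser = argparse.ArgumentParser()
+#         parser.add_argument("--gamma", type=float, default=0.99)
+#         parser.add_argument("--seed", type=int, default=543)
+#         parser.add_argument("--render", action="store_true")
+#         parser.add_argument("--log-interval", type=int, default=10)
+#         parser.add_argument("--max-episodes", type=int, default=10000)
+#         args = parser.parse_args()
+#
+#         env = gym.make("CartPole-v1")
+#         main(env, args)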