
Commit 94dd3f5

Merge branch 'master' into ChrisWu1997-patch-1
2 parents d54820a + 8ee2bf8 commit 94dd3f5

File tree: 4 files changed, +33 -30 lines

examples/reinforcement_learning/tutorial_AC.py
examples/reinforcement_learning/tutorial_DPPO.py
examples/reinforcement_learning/tutorial_PPO.py
examples/reinforcement_learning/tutorial_SAC.py


examples/reinforcement_learning/tutorial_AC.py

Lines changed: 11 additions & 8 deletions

@@ -46,11 +46,11 @@
 
 """
 import argparse
-import os
 import time
+import matplotlib.pyplot as plt
+import os
 
 import gym
-import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
 
@@ -78,6 +78,8 @@
 LR_A = 0.001  # learning rate for actor
 LR_C = 0.01  # learning rate for critic
 
+
+
 ###############################  Actor-Critic  ####################################
 
 
@@ -137,12 +139,13 @@ def __init__(self, state_dim, lr=0.01):
 
         self.optimizer = tf.optimizers.Adam(lr)
 
-    def learn(self, state, reward, state_):
+    def learn(self, state, reward, state_, done):
+        d = 0 if done else 1
         v_ = self.model(np.array([state_]))
         with tf.GradientTape() as tape:
             v = self.model(np.array([state]))
-            ## TD_error = r + lambda * V(newS) - V(S)
-            td_error = reward + LAM * v_ - v
+            ## TD_error = r + d * lambda * V(newS) - V(S)
+            td_error = reward + d * LAM * v_ - v
             loss = tf.square(td_error)
         grad = tape.gradient(loss, self.model.trainable_weights)
         self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
@@ -203,7 +206,7 @@ def load(self):  # load trained weights
                 state_new, reward, done, info = env.step(action)
                 state_new = state_new.astype(np.float32)
 
-                if done: reward = -20  # reward shaping trick
+                if done: reward = -20    # reward shaping trick
                 # these may helpful in some tasks
                 # if abs(s_new[0]) >= env.observation_space.high[0]:
                 # # cart moves more than 2.4 units from the center
@@ -215,7 +218,7 @@ def load(self):  # load trained weights
 
                 try:
                     td_error = critic.learn(
-                        state, reward, state_new
+                        state, reward, state_new, done
                     )  # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)]
                     actor.learn(state, action, td_error)  # learn Policy : true_gradient = grad[logPi(s, a) * td_error]
                 except KeyboardInterrupt:  # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn()
@@ -238,7 +241,7 @@ def load(self):  # load trained weights
 
                 # Early Stopping for quick check
                 if step >= MAX_STEPS:
-                    print("Early Stopping")  # Hao Dong: it is important for this task
+                    print("Early Stopping")    # Hao Dong: it is important for this task
                     break
        actor.save()
        critic.save()
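
Note on the tutorial_AC.py change: the critic now receives `done` and masks the bootstrap term at episode termination, so a terminal state is no longer valued as if the episode continued. A minimal sketch of the resulting TD target (the helper name `td_target`, the argument `v_next`, and the discount value 0.9 are illustrative; the tutorial uses its constant `LAM`):

def td_target(reward, v_next, done, gamma=0.9):
    # Mirrors the commit: the bootstrap term V(s') is dropped when done=True.
    d = 0 if done else 1
    return reward + d * gamma * v_next

# Non-terminal step: the target bootstraps from the next state's value estimate.
print(td_target(reward=1.0, v_next=5.0, done=False))   # 1.0 + 0.9 * 5.0 = 5.5
# Terminal step: the target collapses to the immediate reward (-20 with the
# reward-shaping trick above) instead of adding gamma * V(s') past the end.
print(td_target(reward=-20.0, v_next=5.0, done=True))  # -20.0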

examples/reinforcement_learning/tutorial_DPPO.py

Lines changed: 8 additions & 7 deletions

@@ -37,8 +37,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
-
 import tensorflow_probability as tfp
+
 import tensorlayer as tl
 
 parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -73,6 +73,7 @@
 # ppo-clip parameters
 EPSILON = 0.2
 
+
 ###############################  DPPO  ####################################
 
 
@@ -282,7 +283,10 @@ def work(self):
                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == MAX_STEPS - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    # finish patyh
-                    v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
+                    if done:
+                        v_s_ = 0
+                    else:
+                        v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
@@ -304,8 +308,7 @@ def work(self):
 
            print(
                'Training | Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
-                    GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r,
-                    time.time() - T0
+                    GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r, time.time() - T0
                )
            )
            # record reward changes, plot later
@@ -372,6 +375,4 @@ def work(self):
        print(
            'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                episode + 1, TEST_EPISODES, episode_reward,
-                time.time() - T0
-            )
-        )
+                time.time() - T0))
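
Note on the tutorial_DPPO.py change: when a worker finishes a path, the bootstrap value for the last state is now 0 if the episode terminated, and only otherwise read from the critic. A standalone sketch of the backward return computation under that rule (the function name, `v_last`, and the discount value are illustrative; the tutorial uses `GAMMA` and the worker's local reward buffer):

def discounted_returns(rewards, v_last, done, gamma=0.9):
    # Bootstrap only when the rollout was truncated, not when it terminated.
    v_s_ = 0.0 if done else v_last
    returns = []
    for r in reversed(rewards):   # same backward pass as buffer_r[::-1]
        v_s_ = r + gamma * v_s_
        returns.append(v_s_)
    returns.reverse()
    return returns

# Truncated rollout: the critic's estimate of the last state is carried back.
print(discounted_returns([1.0, 1.0, 1.0], v_last=10.0, done=False))
# Terminated rollout: the targets come from the observed rewards alone.
print(discounted_returns([1.0, 1.0, 1.0], v_last=10.0, done=True))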

examples/reinforcement_learning/tutorial_PPO.py

Lines changed: 12 additions & 13 deletions

@@ -30,8 +30,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
-
 import tensorflow_probability as tfp
+
 import tensorlayer as tl
 
 parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -63,14 +63,14 @@
 # ppo-clip parameters
 EPSILON = 0.2
 
+
 ###############################  PPO  ####################################
 
 
 class PPO(object):
     """
     PPO class
     """
-
     def __init__(self, state_dim, action_dim, action_bound, method='clip'):
         # critic
         with tf.name_scope('critic'):
@@ -233,13 +233,16 @@ def store_transition(self, state, action, reward):
         self.action_buffer.append(action)
         self.reward_buffer.append(reward)
 
-    def finish_path(self, next_state):
+    def finish_path(self, next_state, done):
         """
         Calculate cumulative reward
         :param next_state:
         :return: None
         """
-        v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
+        if done:
+            v_s_ = 0
+        else:
+            v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
         discounted_r = []
         for r in self.reward_buffer[::-1]:
             v_s_ = r + GAMMA * v_s_
@@ -280,17 +283,15 @@ def finish_path(self, next_state):
                episode_reward += reward
 
                # update ppo
-                if (step + 1) % BATCH_SIZE == 0:
-                    agent.finish_path(state_)
+                if len(agent.state_buffer) >= BATCH_SIZE:
+                    agent.finish_path(state_, done)
                    agent.update()
                if done:
                    break
-            agent.finish_path(state_)
+            agent.finish_path(state_, done)
            print(
                'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
-                    episode + 1, TRAIN_EPISODES, episode_reward,
-                    time.time() - t0
-                )
+                    episode + 1, TRAIN_EPISODES, episode_reward, time.time() - t0)
            )
            if episode == 0:
                all_episode_reward.append(episode_reward)
@@ -318,6 +319,4 @@ def finish_path(self, next_state):
        print(
            'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                episode + 1, TEST_EPISODES, episode_reward,
-                time.time() - t0
-            )
-        )
+                time.time() - t0))
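
Note on the tutorial_PPO.py changes: besides passing `done` into `finish_path` (the same terminal-bootstrap fix as above), the update trigger switches from a per-episode step count to the actual buffer size, so transitions carried over from an episode that ends early still count toward the next update. A rough simulation of the difference (BATCH_SIZE matches the tutorial's constant name; the helper and the episode lengths are made up for illustration):

BATCH_SIZE = 32

def count_updates(episode_lengths, trigger):
    # Counts PPO updates; the buffer persists across episodes until an
    # update empties it, as in the tutorial's training loop.
    buffer_len, updates = 0, 0
    for length in episode_lengths:
        for step in range(length):
            buffer_len += 1
            if trigger(step, buffer_len):
                updates += 1
                buffer_len = 0   # agent.update() clears the buffers
    return updates

old_trigger = lambda step, n: (step + 1) % BATCH_SIZE == 0   # old condition
new_trigger = lambda step, n: n >= BATCH_SIZE                # new condition

# With episodes shorter than BATCH_SIZE the old per-episode counter never
# fires, while the new length check fires once enough data has accumulated.
print(count_updates([20] * 5, old_trigger))  # 0
print(count_updates([20] * 5, new_trigger))  # 3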

examples/reinforcement_learning/tutorial_SAC.py

Lines changed: 2 additions & 2 deletions

@@ -185,7 +185,7 @@ def evaluate(self, state, epsilon=1e-6):
         std = tf.math.exp(log_std)  # no clip in evaluation, clip affects gradients flow
 
         normal = Normal(0, 1)
-        z = normal.sample()
+        z = normal.sample(mean.shape)
         action_0 = tf.math.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
         action = self.action_range * action_0
         # according to original paper, with an extra last term for normalizing different action range
@@ -204,7 +204,7 @@ def get_action(self, state, greedy=False):
         std = tf.math.exp(log_std)
 
         normal = Normal(0, 1)
-        z = normal.sample()
+        z = normal.sample(mean.shape)
         action = self.action_range * tf.math.tanh(
             mean + std * z
         )  # TanhNormal distribution as actions; reparameterization trick
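
Note on the tutorial_SAC.py change: `normal.sample()` draws a single scalar from N(0, 1), which broadcasting then reuses for every element of the batch, correlating the exploration noise; `normal.sample(mean.shape)` draws one independent value per action dimension, which is what the reparameterization trick intends. A small shape check (the batch and action sizes are illustrative; the tutorial builds `Normal` from tensorflow_probability):

import tensorflow as tf
import tensorflow_probability as tfp

Normal = tfp.distributions.Normal

mean = tf.zeros([4, 2])   # a batch of 4 two-dimensional action means
std = tf.ones([4, 2])

normal = Normal(0, 1)
z_old = normal.sample()            # shape (): one scalar shared by the whole batch
z_new = normal.sample(mean.shape)  # shape (4, 2): independent noise per element

print(z_old.shape, z_new.shape)

# Reparameterized actions: with z_new each sample gets its own noise draw.
action = tf.math.tanh(mean + std * z_new)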
