 ##################### hyper parameters ####################
-ENV_NAME = 'HalfCheetah-v2'  # environment name HalfCheetah-v2 Pendulum-v0
+ENV_NAME = 'Pendulum-v0'  # environment name HalfCheetah-v2 Pendulum-v0
 RANDOMSEED = 2  # random seed

-EP_MAX = 1000  # total number of episodes for training
-EP_LEN = 200  # total number of steps for each episode
-GAMMA = 0.9  # reward discount
+EP_MAX = 10000  # total number of episodes for training
+EP_LEN = 1000  # total number of steps for each episode
+GAMMA = 0.99  # reward discount
 A_LR = 0.0001  # learning rate for actor
 C_LR = 0.0002  # learning rate for critic
-BATCH = 128  # update batchsize
-A_UPDATE_STEPS = 10  # actor update steps
-C_UPDATE_STEPS = 10  # critic update steps
+BATCH = 1024  # update batchsize
+A_UPDATE_STEPS = 50  # actor update steps
+C_UPDATE_STEPS = 50  # critic update steps
 EPS = 1e-8  # numerical residual
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
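Review note on the new hyperparameters: GAMMA moves from 0.9 to 0.99 together with the longer EP_LEN. A quick way to sanity-check a discount factor against the episode length is the effective horizon 1/(1 - GAMMA); the snippet below is plain arithmetic for illustration, not code from this repository.

```python
# Rough effective-horizon check for the old and new settings.
for gamma, ep_len in [(0.9, 200), (0.99, 1000)]:
    horizon = 1.0 / (1.0 - gamma)  # roughly how many steps a reward keeps noticeable weight
    print(f"gamma={gamma}: effective horizon ~{horizon:.0f} steps, episode length {ep_len}")
```

With 0.99 the return looks roughly 100 steps ahead, which fits the new 1000-step episodes better than the old 10-step horizon did.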
@@ -273,9 +273,8 @@ def update(self, s, a, r):
         a = torch.FloatTensor(a).to(device)
         r = torch.FloatTensor(r).to(device)

-        self.update_old_pi()
         adv = self.cal_adv(s, r)
-        # adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful, not always, minus mean is dangerous
+        adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful, not always, minus mean is dangerous

         # update actor
         if METHOD['name'] == 'kl_pen':
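Review note: un-commenting this line standardizes the advantages to roughly zero mean and unit variance before the actor steps. A minimal standalone sketch of the same operation (the tensor values are made up for illustration):

```python
import torch

def normalize_adv(adv: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Zero-mean, unit-variance rescaling. Subtracting the mean is the part the
    # original comment warns about, since it can flip the sign of small advantages.
    return (adv - adv.mean()) / (adv.std() + eps)

print(normalize_adv(torch.tensor([1.0, 2.0, 3.0, 10.0])))
```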
@@ -296,7 +295,10 @@ def update(self, s, a, r):

         # update critic
         for _ in range(C_UPDATE_STEPS):
-            self.c_train(r, s)
+            self.c_train(r, s)
+
+        self.update_old_pi()
+

     def choose_action(self, s, deterministic=False):
         '''
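Review note: with self.update_old_pi() moved to the end of update(), the old policy is synced only after the actor and critic steps, so the next call computes its probability ratio against the policy as it stood before that round of gradient updates. The method body is not part of this diff; a common implementation, shown here only as an assumption about what it does, copies the actor's parameters into the old-policy network:

```python
# Hypothetical sketch; attribute names (actor, actor_old) are assumptions, not from this file.
def update_old_pi(self):
    self.actor_old.load_state_dict(self.actor.state_dict())
```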
@@ -337,7 +339,8 @@ def load_model(self, path):


 def main():

-    env = NormalizedActions(gym.make(ENV_NAME).unwrapped)
+    # env = NormalizedActions(gym.make(ENV_NAME).unwrapped)
+    env = gym.make(ENV_NAME)
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
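Review note: dropping the NormalizedActions wrapper means the policy's actions must already lie in the environment's native range, which for Pendulum-v0 is [-2, 2] if memory serves. A quick check, assuming a classic gym install where Pendulum-v0 is registered:

```python
import gym

env = gym.make('Pendulum-v0')
print(env.action_space.low, env.action_space.high)  # expected roughly [-2.] [2.]
```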
@@ -350,13 +353,15 @@ def main():

     if args.train:
         all_ep_r = []
+        buffer = {
+            'state': [],
+            'action': [],
+            'reward': [],
+            'done': []
+        }
         for ep in range(EP_MAX):
             s = env.reset()
-            buffer = {
-                'state': [],
-                'action': [],
-                'reward': []
-            }
+
             ep_r = 0
             t0 = time.time()
             for t in range(EP_LEN):  # in one episode
@@ -366,24 +371,25 @@ def main():
                 buffer['state'].append(s)
                 buffer['action'].append(a)
                 buffer['reward'].append(r)
-                # buffer['reward'].append((r + 8) / 8)  # normalize reward, find to be useful sometimes
+                buffer['done'].append(done)
                 s = s_
                 ep_r += r

                 # update ppo
-                if (t + 1) % BATCH == 0 or t == EP_LEN - 1 or done:
+                # if (t + 1) % BATCH == 0 or t == EP_LEN - 1 or done:
+                if (t + 1) % BATCH == 0:
                     if done:
                         v_s_ = 0
                     else:
                         v_s_ = ppo.get_v(s_)[0]
                     discounted_r = []
-                    for r in buffer['reward'][::-1]:
-                        v_s_ = r + GAMMA * v_s_
+                    for r, d in zip(buffer['reward'][::-1], buffer['done'][::-1]):
+                        v_s_ = r + GAMMA * v_s_ * (1 - d)
                         discounted_r.append(v_s_)
                     discounted_r.reverse()

                     bs, ba, br = np.vstack(buffer['state']), np.vstack(buffer['action']), np.array(discounted_r)[:, np.newaxis]
-                    buffer['state'], buffer['action'], buffer['reward'] = [], [], []
+                    buffer['state'], buffer['action'], buffer['reward'], buffer['done'] = [], [], [], []
                     ppo.update(bs, ba, br)

                     if done:
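Review note: now that the buffer is only flushed every BATCH steps and can span several episodes, the (1 - d) mask is what stops the bootstrapped value from leaking across episode boundaries. A self-contained sketch of the same backward pass with made-up numbers:

```python
GAMMA = 0.99

def discounted_returns(rewards, dones, v_last, gamma=GAMMA):
    # Walk the trajectory backwards; a done flag zeroes the running value, so
    # returns from one episode never bootstrap into the previous one.
    returns, v = [], v_last
    for r, d in zip(rewards[::-1], dones[::-1]):
        v = r + gamma * v * (1 - d)
        returns.append(v)
    returns.reverse()
    return returns

# Two short episodes stored back-to-back in one batch; the done at index 1
# cuts the bootstrap from v_last for the first episode.
print(discounted_returns([1, 1, 1, 1], [0, 1, 0, 0], v_last=5.0))
```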