Commit 6790257

Merge commit: 2 parents 7879a34 + 46fbb7a

File tree: 1 file changed (+27, -21 lines)

ppo_continuous.py: 27 additions & 21 deletions
@@ -55,17 +55,17 @@
 
 ##################### hyper parameters ####################
 
-ENV_NAME = 'HalfCheetah-v2'  # environment name HalfCheetah-v2 Pendulum-v0
+ENV_NAME = 'Pendulum-v0'  # environment name HalfCheetah-v2 Pendulum-v0
 RANDOMSEED = 2  # random seed
 
-EP_MAX = 1000  # total number of episodes for training
-EP_LEN = 200  # total number of steps for each episode
-GAMMA = 0.9  # reward discount
+EP_MAX = 10000  # total number of episodes for training
+EP_LEN = 1000  # total number of steps for each episode
+GAMMA = 0.99  # reward discount
 A_LR = 0.0001  # learning rate for actor
 C_LR = 0.0002  # learning rate for critic
-BATCH = 128  # update batchsize
-A_UPDATE_STEPS = 10  # actor update steps
-C_UPDATE_STEPS = 10  # critic update steps
+BATCH = 1024  # update batchsize
+A_UPDATE_STEPS = 50  # actor update steps
+C_UPDATE_STEPS = 50  # critic update steps
 EPS = 1e-8  # numerical residual
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
@@ -273,9 +273,8 @@ def update(self, s, a, r):
         a = torch.FloatTensor(a).to(device)
         r = torch.FloatTensor(r).to(device)
 
-        self.update_old_pi()
         adv = self.cal_adv(s, r)
-        # adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful, not always, minus mean is dangerous
+        adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful, not always, minus mean is dangerous
 
         # update actor
         if METHOD['name'] == 'kl_pen':
@@ -296,7 +295,10 @@ def update(self, s, a, r):
 
         # update critic
         for _ in range(C_UPDATE_STEPS):
-            self.c_train(r, s)
+            self.c_train(r, s)
+
+        self.update_old_pi()
+
 
     def choose_action(self, s, deterministic=False):
         '''
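Note on the two hunks above: advantage normalization is now enabled, and the old-policy snapshot is refreshed at the end of update(), after the actor and critic loops, rather than before the advantage computation. A minimal sketch of how such a snapshot is commonly implemented, assuming update_old_pi copies weights with load_state_dict (the module shapes and names below are illustrative, not the repository's):

    import torch.nn as nn

    actor = nn.Linear(4, 2)      # stand-in for the current policy network
    actor_old = nn.Linear(4, 2)  # frozen copy used to form the PPO probability ratio

    def update_old_pi():
        # hypothetical helper: sync the frozen copy with the freshly trained actor
        actor_old.load_state_dict(actor.state_dict())

    # ordering after this commit:
    #   1) compute advantages and run A_UPDATE_STEPS actor steps against the frozen copy
    #   2) run C_UPDATE_STEPS critic steps
    #   3) only then refresh the snapshot for the next batch
    update_old_pi()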
@@ -337,7 +339,8 @@ def load_model(self, path):
 
 def main():
 
-    env = NormalizedActions(gym.make(ENV_NAME).unwrapped)
+    # env = NormalizedActions(gym.make(ENV_NAME).unwrapped)
+    env = gym.make(ENV_NAME)
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
 
@@ -350,13 +353,15 @@ def main():
 
     if args.train:
         all_ep_r = []
+        buffer={
+            'state':[],
+            'action':[],
+            'reward':[],
+            'done':[]
+        }
         for ep in range(EP_MAX):
             s = env.reset()
-            buffer={
-                'state':[],
-                'action':[],
-                'reward':[]
-            }
+
             ep_r = 0
             t0 = time.time()
             for t in range(EP_LEN):  # in one episode
@@ -366,24 +371,25 @@ def main():
                 buffer['state'].append(s)
                 buffer['action'].append(a)
                 buffer['reward'].append(r)
-                # buffer['reward'].append((r + 8) / 8)  # normalize reward, find to be useful sometimes
+                buffer['done'].append(done)
                 s = s_
                 ep_r += r
 
                 # update ppo
-                if (t + 1) % BATCH == 0 or t == EP_LEN - 1 or done:
+                # if (t + 1) % BATCH == 0 or t == EP_LEN - 1 or done:
+                if (t + 1) % BATCH == 0:
                     if done:
                         v_s_=0
                     else:
                         v_s_ = ppo.get_v(s_)[0]
                     discounted_r = []
-                    for r in buffer['reward'][::-1]:
-                        v_s_ = r + GAMMA * v_s_
+                    for r, d in zip(buffer['reward'][::-1], buffer['done'][::-1]):
+                        v_s_ = r + GAMMA * v_s_ * (1-d)
                         discounted_r.append(v_s_)
                     discounted_r.reverse()
 
                     bs, ba, br = np.vstack(buffer['state']), np.vstack(buffer['action']), np.array(discounted_r)[:, np.newaxis]
-                    buffer['state'], buffer['action'], buffer['reward'] = [], [], []
+                    buffer['state'], buffer['action'], buffer['reward'], buffer['done'] = [], [], [], []
                     ppo.update(bs, ba, br)
 
                     if done:
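Note on this hunk: because the buffer is now created once outside the episode loop and cleared only when an update fires at a BATCH-step boundary, a batch can span several episodes, so the backward reward-to-go pass must stop bootstrapping across terminal transitions; the (1-d) factor zeroes the carried value at exactly those points. A standalone sketch of the computation (the numbers below are made up for illustration):

    GAMMA = 0.99
    rewards = [1.0, 0.5, 2.0, 1.5]         # illustrative rewards, oldest first
    dones   = [False, True, False, False]  # the episode ended after the second step
    v_s_ = 3.0                             # critic bootstrap value for the final next-state

    discounted_r = []
    for r, d in zip(rewards[::-1], dones[::-1]):
        v_s_ = r + GAMMA * v_s_ * (1 - d)  # (1 - d) drops the bootstrap at terminal steps
        discounted_r.append(v_s_)
    discounted_r.reverse()                 # returns restart from 0.5 at the terminal step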
