
Commit b7e6353
optimize ppo continuous
1 parent 1b243b9

File tree: 1 file changed, +22 -22

ppo_gae_continuous.py (22 additions, 22 deletions)
@@ -9,11 +9,11 @@

 #Hyperparameters
 learning_rate = 1e-4
-gamma = 0.98
+gamma = 0.99
 lmbda = 0.95
 eps_clip = 0.1
 batch_size = 4096
-K_epoch = 3
+K_epoch = 20
 T_horizon = 10000

 class NormalizedActions(gym.ActionWrapper):
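
For intuition only (not part of the commit): the discount factor implies an effective credit-assignment horizon of roughly 1/(1 - gamma) steps, so moving from 0.98 to 0.99 roughly doubles how far ahead rewards are weighted, which pairs with the larger K_epoch and the GAE change further down. A throwaway Python check:

for gamma in (0.98, 0.99):
    # sum of discount weights gamma^t over an infinite horizon is 1 / (1 - gamma)
    print(gamma, "-> effective horizon ~", round(1.0 / (1.0 - gamma)), "steps")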
@@ -50,6 +50,8 @@ def __init__(self, num_inputs, num_actions, hidden_size, action_range = 1.):

         self.mean_linear = nn.Linear(hidden_size, num_actions)
         self.log_std_linear = nn.Linear(hidden_size, num_actions)
+        # self.log_std_param = nn.Parameter(torch.zeros(num_actions))
+
         self.v_linear = nn.Linear(hidden_size, 1)

         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
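
A minimal sketch (not code from this repository) contrasting the two log_std parameterizations this hunk juxtaposes: the state-dependent linear head the commit keeps, and the commented-out alternative of a single learnable parameter shared across all states. The class name and the state_dependent_std flag are hypothetical; the layer names follow the diff.

import torch
import torch.nn as nn

class GaussianHeadSketch(nn.Module):
    def __init__(self, hidden_size, num_actions, state_dependent_std=True):
        super().__init__()
        self.mean_linear = nn.Linear(hidden_size, num_actions)
        if state_dependent_std:
            # option used in the commit: log_std predicted from the hidden features
            self.log_std_linear = nn.Linear(hidden_size, num_actions)
        else:
            # commented-out alternative: one learnable log_std shared by all states
            self.log_std_param = nn.Parameter(torch.zeros(num_actions))
        self.v_linear = nn.Linear(hidden_size, 1)  # value head, as in the diff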
@@ -63,6 +65,7 @@ def pi(self, x):

         mean = F.tanh(self.mean_linear(x1))
         log_std = self.log_std_linear(x2)
+        # log_std = self.log_std_param.expand_as(mean)

         return mean, log_std

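On the forward side, the same choice appears as the commented-out expand_as line: either log_std comes from its own head, or a shared parameter is broadcast to the shape of the mean. A hedged sketch of how pi() could look under either option; the sampling step is shown only as assumed usage, since the commit does not show it here.

import torch

def pi_sketch(self, x1, x2):
    # hypothetical forward pass mirroring the diff
    mean = torch.tanh(self.mean_linear(x1))
    if hasattr(self, "log_std_linear"):
        log_std = self.log_std_linear(x2)             # state-dependent std (active code)
    else:
        log_std = self.log_std_param.expand_as(mean)  # shared std (commented-out option)
    return mean, log_std

# assumed usage: dist = torch.distributions.Normal(mean, log_std.exp()); a = dist.sample()
# log_pi_a = dist.log_prob(a).sum(-1, keepdim=True)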

@@ -129,27 +132,24 @@ def train_net(self):
         s, a, r, s_prime, done_mask, prob_a = self.make_batch()
         done_mask_ = torch.flip(done_mask, dims=(0,))

-        for i in range(K_epoch):
-            td_target = r + gamma * self.v(s_prime) * done_mask
-            delta = td_target - self.v(s)
-            delta = delta.detach().numpy()
-
-            advantage_lst = []
-            advantage = 0.0
-            for delta_t, mask in zip(delta[::-1], done_mask_):
-                advantage = gamma * lmbda * advantage * mask + delta_t[0]
-                advantage_lst.append([advantage])
-            advantage_lst.reverse()
-            advantage = torch.tensor(advantage_lst, dtype=torch.float)
-            if not np.isnan(advantage.std()):
-                advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
-
+        # put target value computation before the epoch loop to reduce computation and stabilize training
+        td_target = r + gamma * self.v(s_prime) * done_mask
+        delta = td_target - self.v(s)
+        delta = delta.detach().numpy()
+
+        advantage_lst = []
+        advantage = 0.0
+        for delta_t, mask in zip(delta[::-1], done_mask_):
+            advantage = gamma * lmbda * advantage * mask + delta_t[0]
+            advantage_lst.append([advantage])
+        advantage_lst.reverse()
+        advantage = torch.tensor(advantage_lst, dtype=torch.float)
+        if not np.isnan(advantage.std()):
+            advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
+
+        for i in range(K_epoch):
             mean, log_std = self.pi(s)
-            try:
-                log_pi_a = self.get_log_prob(mean, log_std, a)
-            except:
-                print(s, a)
-                print(mean, log_std)
+            log_pi_a = self.get_log_prob(mean, log_std, a)
             # pi = self.pi(s, softmax_dim=1)
             # pi_a = pi.gather(1,a)
             ratio = torch.exp(log_pi_a - torch.log(prob_a)) # a/b == exp(log(a)-log(b))
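
For reference, a standalone sketch of the two pieces this hunk touches: GAE advantages computed once per batch (now hoisted out of the K_epoch loop) and the clipped surrogate that consumes the ratio. Function names, tensor shapes, and the loss form are assumptions for illustration; only the recursion and the ratio expression come from the diff.

import torch

def gae_advantages(r, v_s, v_s_prime, done_mask, gamma=0.99, lmbda=0.95):
    # done_mask is 1.0 for non-terminal steps and 0.0 at episode ends, shape (T, 1)
    td_target = r + gamma * v_s_prime * done_mask
    delta = (td_target - v_s).detach()
    advantage = torch.zeros_like(delta)
    running = torch.zeros(1)
    for t in reversed(range(delta.shape[0])):
        # same recursion as the diff: A_t = delta_t + gamma * lambda * mask_t * A_{t+1}
        running = delta[t] + gamma * lmbda * done_mask[t] * running
        advantage[t] = running
    std = advantage.std()
    if not torch.isnan(std):  # guard against NaN std on tiny batches, as the diff does
        advantage = (advantage - advantage.mean()) / (std + 1e-5)
    return td_target, advantage

def clipped_surrogate(log_pi_a, prob_a, advantage, eps_clip=0.1):
    # a/b == exp(log(a) - log(b)), mirroring the ratio line above
    ratio = torch.exp(log_pi_a - torch.log(prob_a))
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantage
    return -torch.min(surr1, surr2).mean()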
