
Commit 1b243b9

fix bug in ppo_gae_continuous
1 parent 660e9d7


ppo_gae_continuous.py

Lines changed: 19 additions & 6 deletions
@@ -78,13 +78,22 @@ def v(self, x):
     def get_action(self, x):
         mean, log_std = self.pi(x)
         std = log_std.exp()
-        normal = Normal(0, 1)
-        z = normal.sample()
-        action = mean + std*z
-        log_prob = Normal(mean, std).log_prob(action)
-        log_prob = log_prob.sum(dim=-1, keepdim=True) # reduce dim
+        normal = Normal(mean, std)
+        action = normal.sample()
+        log_prob = normal.log_prob(action).sum(-1)
         prob = log_prob.exp()

+        ## The following way of generating the action seems incorrect:
+        ## all dimensions of the action depend on the same hidden variable z.
+        ## In some envs like Ant-v2 this may keep the agent from falling easily due to the correlated actions,
+        ## but it does not hold true in general and may cause numerical problems (nan) in the update.
+        # normal = Normal(0, 1)
+        # z = normal.sample()
+        # action = mean + std*z
+        # log_prob = Normal(mean, std).log_prob(action)
+        # log_prob = log_prob.sum(dim=-1, keepdim=True) # reduce dim
+        # prob = log_prob.exp()

         action = self.action_range*action # scale the action

         return action.detach().numpy(), prob
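For context on the fix above, here is a minimal standalone sketch (tensor shapes and values are illustrative, not taken from this repo) of the difference between the two sampling schemes. With a single scalar z, every action dimension is a rescaling of the same draw, so the dimensions are perfectly correlated and the log_prob evaluated under Normal(mean, std) is not the density of the distribution the action was actually drawn from; sampling directly from Normal(mean, std) draws each dimension independently, and the summed log_prob is then consistent with the sample.

    import torch
    from torch.distributions import Normal

    mean = torch.zeros(4)                       # illustrative 4-dim action mean
    std = torch.tensor([0.5, 1.0, 1.5, 2.0])    # illustrative per-dimension std

    # Old scheme: one scalar z is shared by every action dimension, so the
    # components of action_old are perfectly correlated, and the log-probability
    # below is computed under a distribution that did not generate action_old.
    z = Normal(0.0, 1.0).sample()               # a single scalar draw
    action_old = mean + std * z
    log_prob_old = Normal(mean, std).log_prob(action_old).sum(-1)

    # Fixed scheme: Normal(mean, std) samples each dimension independently,
    # and the summed log-probability matches the sampled action.
    normal = Normal(mean, std)
    action_new = normal.sample()
    log_prob_new = normal.log_prob(action_new).sum(-1)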
@@ -136,7 +145,11 @@ def train_net(self):
             advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)

             mean, log_std = self.pi(s)
-            log_pi_a = self.get_log_prob(mean, log_std, a)
+            try:
+                log_pi_a = self.get_log_prob(mean, log_std, a)
+            except:
+                print(s, a)
+                print(mean, log_std)
             # pi = self.pi(s, softmax_dim=1)
             # pi_a = pi.gather(1,a)
             ratio = torch.exp(log_pi_a - torch.log(prob_a))  # a/b == exp(log(a)-log(b))
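The try/except added above only prints the offending inputs when self.get_log_prob raises; the helper itself is not part of this diff. As a rough, hedged sketch of what such a helper typically computes for a diagonal Gaussian policy (the actual implementation in this file may differ), it returns the log-density of the stored actions under the current policy, which then feeds the ratio exp(log_pi_a - log(prob_a)) used in the PPO objective:

    from torch.distributions import Normal

    # Hedged sketch only: mirrors the call self.get_log_prob(mean, log_std, a)
    # in the diff, but the real method body is not shown in this commit.
    def get_log_prob(mean, log_std, a):
        std = log_std.exp()
        dist = Normal(mean, std)
        # log-density of each stored action, summed over action dimensions
        return dist.log_prob(a).sum(-1, keepdim=True)

If this raises or returns nan, it is usually because mean or log_std has already turned nan during the update, which is what the added print statements are meant to surface.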
