@@ -78,13 +78,22 @@ def v(self, x):
     def get_action(self, x):
         mean, log_std = self.pi(x)
         std = log_std.exp()
-        normal = Normal(0, 1)
-        z = normal.sample()
-        action = mean + std * z
-        log_prob = Normal(mean, std).log_prob(action)
-        log_prob = log_prob.sum(dim=-1, keepdim=True)  # reduce dim
+        normal = Normal(mean, std)
+        action = normal.sample()
+        log_prob = normal.log_prob(action).sum(-1)
         prob = log_prob.exp()
 
+        ## The following way of generating the action does not seem correct:
+        ## all dimensions of the action depend on the same hidden variable z.
+        ## In some envs, such as Ant-v2, the resulting correlation between action dimensions may keep the agent from falling easily,
+        ## but this does not hold in general and may cause numerical problems (NaN) in the update.
+        # normal = Normal(0, 1)
+        # z = normal.sample()
+        # action = mean + std*z
+        # log_prob = Normal(mean, std).log_prob(action)
+        # log_prob = log_prob.sum(dim=-1, keepdim=True)  # reduce dim
+        # prob = log_prob.exp()
+
         action = self.action_range * action  # scale the action
 
         return action.detach().numpy(), prob
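
For context, the difference between the two sampling schemes can be seen in the small standalone sketch below (not part of the commit; the 3-dimensional action space and the example mean/std values are made up for illustration). The new code draws independent noise for every action dimension, whereas the commented-out code shares one scalar z across all dimensions, so the per-dimension noise is perfectly correlated and the summed log_prob no longer describes the distribution the action was actually drawn from.

import torch
from torch.distributions import Normal

mean = torch.zeros(3)                  # example policy mean for a 3-dim action
std = torch.tensor([0.1, 1.0, 2.0])    # example per-dimension std

# New scheme: each action dimension gets its own independent noise draw.
action_new = Normal(mean, std).sample()

# Old (commented-out) scheme: one scalar z is shared by every dimension,
# so the noise in all dimensions is perfectly correlated.
z = Normal(0.0, 1.0).sample()          # a single scalar draw
action_old = mean + std * z            # all dims move together with z

# The old scheme then evaluates log_prob as if the dimensions were independent,
# which does not match how action_old was actually generated.
log_prob_old = Normal(mean, std).log_prob(action_old).sum(-1)
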
@@ -136,7 +145,11 @@ def train_net(self):
         advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
 
         mean, log_std = self.pi(s)
-        log_pi_a = self.get_log_prob(mean, log_std, a)
+        try:
+            log_pi_a = self.get_log_prob(mean, log_std, a)
+        except:
+            print(s, a)
+            print(mean, log_std)
         # pi = self.pi(s, softmax_dim=1)
         # pi_a = pi.gather(1,a)
         ratio = torch.exp(log_pi_a - torch.log(prob_a))  # a/b == exp(log(a)-log(b))
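
The helper self.get_log_prob(mean, log_std, a) is not shown in this hunk; based on how it is called here, it presumably evaluates the diagonal Gaussian log-probability of the stored actions, roughly along the lines of the sketch below (an assumption, not the repository's actual code). The resulting log_pi_a then forms the PPO ratio exp(log_pi_a - log(prob_a)) on the last line of the hunk.

import torch
from torch.distributions import Normal

def get_log_prob(mean, log_std, a):
    # Assumed shape of the helper: diagonal Gaussian log-prob of the stored
    # actions, summed over action dimensions (keepdim depends on buffer shapes).
    std = log_std.exp()
    log_prob = Normal(mean, std).log_prob(a)
    return log_prob.sum(-1, keepdim=True)

# If mean or log_std already contain NaN/inf after a diverging update,
# constructing Normal(mean, std) can raise a ValueError (depending on the
# PyTorch version's default argument validation), which is the failure the
# try/except in the diff above prints diagnostics for.
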