Commit cb5aea7

fix
1 parent 8429fb0 commit cb5aea7

File tree

3 files changed: +6 -6 lines changed

ppo_continuous.py

Lines changed: 4 additions & 4 deletions

@@ -151,13 +151,13 @@ def forward(self, state):
     def get_action(self, state, deterministic=False):
         state = torch.FloatTensor(state).unsqueeze(0).to(device)
         mean, log_std = self.forward(state)
-        std = log_std.exp()
-        normal = Normal(0, 1)
-        z = normal.sample()
+
         if deterministic:
             action = mean
         else:
-            action = mean+std*z
+            std = log_std.exp()
+            normal = Normal(mean, std)
+            action = normal.sample()
         action = torch.clamp(action, -self.action_range, self.action_range)
         return action.squeeze(0)
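Both sampling routes draw from the same diagonal Gaussian: mean + std * z with z ~ Normal(0, 1) is the reparameterized form of Normal(mean, std). The practical effect of the change is that std is computed and the distribution sampled only in the stochastic branch, so deterministic evaluation skips the sampling work. A minimal standalone sketch of the patched logic, with made-up tensors standing in for the network outputs and the action_range attribute:

import torch
from torch.distributions import Normal

# Stand-ins for the policy network outputs and the action_range attribute
# used in ppo_continuous.py; the values here are illustrative only.
mean = torch.tensor([[0.3, -0.1]])
log_std = torch.tensor([[-0.5, -0.5]])
action_range = 1.0
deterministic = False

if deterministic:
    action = mean                    # evaluation: take the distribution mean
else:
    std = log_std.exp()              # std is only needed when sampling
    normal = Normal(mean, std)       # diagonal Gaussian over actions
    action = normal.sample()         # same distribution as mean + std * N(0, 1)
action = torch.clamp(action, -action_range, action_range)
print(action.squeeze(0))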

ppo_continuous3.py

Lines changed: 2 additions & 1 deletion

@@ -8,7 +8,8 @@
 * It merge the losses of critic and actor into one update manner, using a single optimizer
 instead of one for actor and one for critic.
 * It uses the min of clipping value loss and non-clipping value loss.
-* It additionally has a policy entropy bonus in loss (line 145)
+* It additionally has a policy entropy bonus in loss (line 146).
+* It uses MultivariateNormal for policy distribution instead of Normal.


 To run
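As the updated docstring notes, ppo_continuous3.py builds its policy on MultivariateNormal rather than per-dimension Normal distributions. With a diagonal covariance the two sample the same values, but MultivariateNormal returns a single joint log-probability and entropy per action vector, which is convenient for the entropy bonus mentioned above. A hedged sketch of that pattern (tensor values and variable names are illustrative, not taken from the file):

import torch
from torch.distributions import MultivariateNormal

# Illustrative policy-head outputs; not values from ppo_continuous3.py.
mean = torch.tensor([[0.2, -0.4]])        # batch of action means
std = torch.tensor([[0.5, 0.5]])          # per-dimension standard deviations
cov = torch.diag_embed(std ** 2)          # diagonal covariance matrix

dist = MultivariateNormal(mean, cov)
action = dist.sample()                    # one joint action sample per batch row
log_prob = dist.log_prob(action)          # joint log-density (summed over action dims)
entropy = dist.entropy()                  # term usable as a policy entropy bonus
print(action, log_prob, entropy)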

ppo_gae_continuous.py

Lines changed: 0 additions & 1 deletion

@@ -123,7 +123,6 @@ def train_net(self):
         for i in range(K_epoch):
             td_target = r + gamma * self.v(s_prime) * done_mask
             delta = td_target - self.v(s)
-            advantage = delta
             delta = delta.detach().numpy()

             advantage_lst = []
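The deleted assignment was dead code: advantage is rebuilt by the GAE pass that fills advantage_lst from the delta array just below this hunk. For context, a minimal sketch of that standard backward GAE recursion, assuming the usual gamma/lambda form; the hyperparameter values and delta array here are made up, not taken from the repository:

import numpy as np

# Illustrative inputs; gamma, lmbda and delta are not values from this repo.
gamma, lmbda = 0.99, 0.95
delta = np.array([[0.5], [0.2], [-0.1], [0.3]])   # per-step TD errors

advantage_lst = []
advantage = 0.0
for delta_t in delta[::-1]:                       # walk backwards through time
    advantage = gamma * lmbda * advantage + delta_t[0]
    advantage_lst.append([advantage])
advantage_lst.reverse()
print(advantage_lst)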
