@@ -154,20 +154,20 @@ def act(self, state):
 
     # training cycle
     def train(
-        self,
-        replay_buffer,
-        iterations,
-        batch_size,
-        discount=0.99,
-        tau=0.005,
-        policy_noise=0.2,
-        noise_clip=0.5,
-        policy_freq=2,
-        max_lin_vel=0.5,
-        max_ang_vel=1,
-        goal_reward=100,
-        distance_norm=10,
-        time_step=0.3,
+        self,
+        replay_buffer,
+        iterations,
+        batch_size,
+        discount=0.99,
+        tau=0.005,
+        policy_noise=0.2,
+        noise_clip=0.5,
+        policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_b = 0
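The keyword defaults above follow common TD3 practice (discount 0.99, soft-update rate 0.005, target-policy noise 0.2 clipped at 0.5, delayed policy updates every second critic step), plus robot-specific limits that feed the reward upper bound. Below is a hypothetical call site for this method; the `agent` and `replay_buffer` names and the iteration/batch values are placeholders, not part of this commit.

```python
def run_training(agent, replay_buffer):
    # Hypothetical invocation of the train() method shown above; `agent` and
    # `replay_buffer` are assumed names, and iterations/batch_size are placeholders.
    agent.train(
        replay_buffer,
        iterations=100,
        batch_size=64,
        discount=0.99,     # gamma in the Bellman backup
        tau=0.005,         # soft-update rate for the target networks
        policy_noise=0.2,  # std of the target-policy smoothing noise
        noise_clip=0.5,    # clamp applied to that noise
        policy_freq=2,     # delayed actor/target updates (TD3-style)
        max_lin_vel=0.5,   # robot limits used by the reward upper bound
        max_ang_vel=1,
        goal_reward=100,
        distance_norm=10,
        time_step=0.3,
    )
```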
@@ -203,15 +203,13 @@ def train(
             next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
 
             # Calculate the Q values from the critic-target network for the next state-action pair
-            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
+            target_Q = self.critic_target(next_state, next_action)
 
-            # Select the minimal Q value from the 2 calculated values
-            target_Q = torch.min(target_Q1, target_Q2)
             av_Q += torch.mean(target_Q)
             max_Q = max(max_Q, torch.max(target_Q))
-
             # Calculate the final Q value from the target network parameters by using Bellman equation
             target_Q = reward + ((1 - done) * discount * target_Q).detach()
+
             # Get the Q values of the basis networks with the current parameters
             current_Q = self.critic(state, action)
 
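The removed lines implemented TD3's clipped double-Q target (the minimum over two target-critic heads); the commit switches to backing up a single target-critic output. A minimal sketch of both variants, assuming the critic-target call signatures shown in the diff:

```python
import torch


def bellman_target_single(critic_target, next_state, next_action, reward, done, discount=0.99):
    # Variant kept by this diff: a single target-critic output feeds the Bellman backup.
    target_q = critic_target(next_state, next_action)
    return reward + ((1 - done) * discount * target_q).detach()


def bellman_target_clipped_double(critic_target, next_state, next_action, reward, done, discount=0.99):
    # Variant removed by this diff (standard TD3): take the element-wise minimum of
    # the two target-critic heads to curb Q overestimation before the backup.
    q1, q2 = critic_target(next_state, next_action)
    return reward + ((1 - done) * discount * torch.min(q1, q2)).detach()
```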
@@ -224,15 +222,18 @@ def train(
                 distance_norm,
                 goal_reward,
                 reward,
+                done,
             )
-            max_b += max(max_b, torch.max(max_bound))
+            max_b = max(max_b, torch.max(max_bound))
             max_bound_loss_Q = current_Q - max_bound
             max_bound_loss_Q[max_bound_loss_Q < 0] = 0
             max_bound_loss_Q = torch.square(max_bound_loss_Q).mean()
+            max_bound_loss = max_bound_loss_Q
 
             # Calculate the loss between the current Q value and the target Q value
             loss_target_Q = F.mse_loss(current_Q, target_Q)
-            max_bound_loss = self.bound_weight * max_bound_loss_Q
+
+            max_bound_loss = self.bound_weight * max_bound_loss
             loss = loss_target_Q + max_bound_loss
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
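This hunk assembles the critic loss from two parts: the usual TD error against the Bellman target, and a one-sided penalty that pushes Q estimates down only where they exceed the analytic upper bound `max_bound`, scaled by `self.bound_weight`. A compact restatement of that loss, using `torch.clamp` in place of the in-place masking above:

```python
import torch
import torch.nn.functional as F


def critic_loss_with_upper_bound(current_q, target_q, max_bound, bound_weight):
    # Standard TD term between the online critic and the Bellman target.
    loss_target_q = F.mse_loss(current_q, target_q)
    # One-sided penalty: only the portion of the Q estimate that exceeds the
    # analytic upper bound is penalized; estimates below the bound are untouched.
    overshoot = torch.clamp(current_q - max_bound, min=0.0)
    return loss_target_q + bound_weight * torch.square(overshoot).mean()
```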
@@ -242,7 +243,7 @@ def train(
             if it % policy_freq == 0:
                 # Maximize the actor output value by performing gradient descent on negative Q values
                 # (essentially perform gradient ascent)
-                actor_grad, _ = self.critic(state, self.actor(state))
+                actor_grad = self.critic(state, self.actor(state))
                 actor_grad = -actor_grad.mean()
                 self.actor_optimizer.zero_grad()
                 actor_grad.backward()
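Because the critic now returns a single Q tensor (matching the target computation earlier in this diff), the actor update no longer unpacks and discards a second head; the policy is still improved by minimizing the negated mean Q. A minimal sketch:

```python
def actor_loss_single_head(critic, actor, state):
    # Deterministic policy-gradient objective used above: minimizing the negated
    # mean Q value performs gradient ascent on the critic's estimate.
    return -critic(state, actor(state)).mean()
```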
@@ -251,15 +252,15 @@ def train(
                 # Use soft update to update the actor-target network parameters by
                 # infusing small amount of current parameters
                 for param, target_param in zip(
-                    self.actor.parameters(), self.actor_target.parameters()
+                    self.actor.parameters(), self.actor_target.parameters()
                 ):
                     target_param.data.copy_(
                         tau * param.data + (1 - tau) * target_param.data
                     )
                 # Use soft update to update the critic-target network parameters by infusing
                 # small amount of current parameters
                 for param, target_param in zip(
-                    self.critic.parameters(), self.critic_target.parameters()
+                    self.critic.parameters(), self.critic_target.parameters()
                 ):
                     target_param.data.copy_(
                         tau * param.data + (1 - tau) * target_param.data
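Both loops apply the same Polyak averaging, target <- tau * online + (1 - tau) * target. A hypothetical helper capturing that update (the function name is an assumption, not part of the codebase):

```python
import torch


@torch.no_grad()
def soft_update(online_net, target_net, tau=0.005):
    # Polyak averaging, as applied above to both the actor and critic targets:
    # target <- tau * online + (1 - tau) * target.
    for param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
```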
@@ -284,15 +285,16 @@ def train(
            self.save(filename=self.model_name, directory=self.save_directory)
 
    def get_max_bound(
-        self,
-        next_state,
-        discount,
-        max_ang_vel,
-        max_lin_vel,
-        time_step,
-        distance_norm,
-        goal_reward,
-        reward,
+        self,
+        next_state,
+        discount,
+        max_ang_vel,
+        max_lin_vel,
+        time_step,
+        distance_norm,
+        goal_reward,
+        reward,
+        done,
    ):
        cos = next_state[:, -4]
        sin = next_state[:, -3]
@@ -302,7 +304,7 @@ def get_max_bound(
        full_turn_steps = torch.floor(turn_steps.abs())
        turn_rew = [
            (
-                -1 * discount**step * max_ang_vel
+                -1 * discount**step * max_ang_vel
                if step
                else torch.zeros(1, device=self.device)
            )
@@ -324,20 +326,20 @@ def get_max_bound(
        final_steps = torch.ceil(distances) + full_turn_steps
        inter_steps = torch.trunc(distances) + full_turn_steps
        final_discount = torch.tensor(
-            [discount**pw for pw in final_steps], device=self.device
+            [discount**pw for pw in final_steps], device=self.device
        )
        final_rew = (
-            torch.ones_like(distances, device=self.device)
-            * goal_reward
-            * final_discount
+            torch.ones_like(distances, device=self.device)
+            * goal_reward
+            * final_discount
        )
 
        max_inter_steps = inter_steps.max()
        exponents = torch.arange(
            1, max_inter_steps + 1, dtype=torch.float32, device=self.device
        )
        discount_exponents = torch.tensor(
-            [discount**e for e in exponents], device=self.device
+            [discount**e for e in exponents], device=self.device
        )
        inter_rew = torch.tensor(
            [
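The bound construction above estimates the best achievable discounted return: reach the goal in the fewest possible time steps (translation plus turning at maximum velocity), collect `goal_reward` discounted by that step count, and add the discounted intermediate rewards. The sketch below covers the goal-reward term under that reading; the vectorized exponentiation is assumed to be equivalent in value to the `final_discount`/`final_rew` pair above.

```python
import torch


def discounted_goal_term(distances, full_turn_steps, goal_reward, discount=0.99):
    # Assumed reading of the code above: if the goal can be reached after
    # n = ceil(distances) + full_turn_steps time steps at maximum velocity,
    # the largest attainable goal reward is goal_reward * discount**n.
    final_steps = torch.ceil(distances) + full_turn_steps
    return goal_reward * discount**final_steps
```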
@@ -352,7 +354,7 @@ def get_max_bound(
            device=self.device,
        )
        max_future_rew = full_turn_rew + final_rew + inter_rew
-        max_bound = reward + max_future_rew.view(-1, 1)
+        max_bound = reward + (1 - done) * max_future_rew.view(-1, 1)
        return max_bound
 
    def save(self, filename, directory):
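The added `(1 - done)` factor mirrors the Bellman target earlier in the diff: for terminal transitions no future reward is attainable, so the upper bound collapses to the immediate reward. A minimal sketch of the masked bound:

```python
def bounded_return(reward, done, max_future_rew):
    # Terminal masking as added above: future reward is counted only for
    # non-terminal transitions; shapes follow the diff (column tensors).
    return reward + (1 - done) * max_future_rew.view(-1, 1)
```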