@@ -154,22 +154,23 @@ def act(self, state):

    # training cycle
    def train(
-        self,
-        replay_buffer,
-        iterations,
-        batch_size,
-        discount=0.99,
-        tau=0.005,
-        policy_noise=0.2,
-        noise_clip=0.5,
-        policy_freq=2,
-        max_lin_vel=0.5,
-        max_ang_vel=1,
-        goal_reward=100,
-        distance_norm=10,
-        time_step=0.3,
+        self,
+        replay_buffer,
+        iterations,
+        batch_size,
+        discount=0.99,
+        tau=0.005,
+        policy_noise=0.2,
+        noise_clip=0.5,
+        policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
    ):
        av_Q = 0
+        av_bound = 0
        max_b = 0
        max_Q = -inf
        av_loss = 0
@@ -225,11 +226,10 @@ def train(
                done,
            )
            max_b = max(max_b, torch.max(max_bound))
-            max_bound_loss_Q = current_Q - max_bound
-            max_bound_loss_Q[max_bound_loss_Q < 0] = 0
-            max_bound_loss_Q = torch.square(max_bound_loss_Q).mean()
-            max_bound_loss = max_bound_loss_Q
+            av_bound += torch.mean(max_bound)

+            max_bound_Q = torch.min(current_Q, max_bound)
+            max_bound_loss = F.mse_loss(current_Q, max_bound_Q)

            # Calculate the loss between the current Q value and the target Q value
            loss_target_Q = F.mse_loss(current_Q, target_Q)

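Note on the hunk above: the old code penalized only the positive part of current_Q - max_bound (a clamped, squared penalty), while the new code clips current_Q at max_bound, takes an MSE against the clipped copy, and also accumulates av_bound for logging. If max_bound is treated as a constant target, the two loss formulations coincide; the following standalone sketch (not part of the patch, made-up numbers) checks this:

import torch
import torch.nn.functional as F

current_Q = torch.tensor([1.0, 5.0, -2.0, 9.0])
max_bound = torch.tensor([3.0, 3.0, 3.0, 3.0])

# Old formulation: square only the amount by which Q exceeds the bound.
old_loss = torch.square(torch.clamp(current_Q - max_bound, min=0.0)).mean()

# New formulation: MSE between Q and Q clipped at the bound.
new_loss = F.mse_loss(current_Q, torch.min(current_Q, max_bound))

assert torch.isclose(old_loss, new_loss)  # both equal 10.0 for these numbers
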
@@ -244,6 +244,7 @@ def train(
                # Maximize the actor output value by performing gradient descent on negative Q values
                # (essentially perform gradient ascent)
                actor_grad = self.critic(state, self.actor(state))
+                actor_grad = torch.min(actor_grad, max_bound)
                actor_grad = -actor_grad.mean()
                self.actor_optimizer.zero_grad()
                actor_grad.backward()
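Note on the hunk above: taking torch.min(actor_grad, max_bound) caps the critic's estimate at the analytic upper bound before the negated mean is used for gradient ascent, so values already above the bound contribute no extra gradient to the actor. A standalone sketch with made-up numbers (not part of the patch):

import torch

q_values = torch.tensor([2.0, 7.0, 4.0], requires_grad=True)
max_bound = torch.tensor([5.0, 5.0, 5.0])

# Negated mean of the capped values, as in the actor update above.
objective = -torch.min(q_values, max_bound).mean()
objective.backward()
print(q_values.grad)  # tensor([-0.3333, 0.0000, -0.3333]); the capped entry gets no gradient
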
@@ -252,15 +253,15 @@ def train(
                # Use soft update to update the actor-target network parameters by
                # infusing small amount of current parameters
                for param, target_param in zip(
-                    self.actor.parameters(), self.actor_target.parameters()
+                    self.actor.parameters(), self.actor_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
                    )
                # Use soft update to update the critic-target network parameters by infusing
                # small amount of current parameters
                for param, target_param in zip(
-                    self.critic.parameters(), self.critic_target.parameters()
+                    self.critic.parameters(), self.critic_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
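Note on the hunk above: the removed and re-added zip arguments are identical here, so the change appears to be whitespace-only. Both loops perform the soft (Polyak) target update target <- tau * param + (1 - tau) * target with tau = 0.005. A minimal standalone sketch of the same update, assuming two identically shaped networks:

import torch.nn as nn

tau = 0.005
net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)

# Nudge each target parameter a small step toward the online parameter.
for param, target_param in zip(net.parameters(), target_net.parameters()):
    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
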
@@ -279,22 +280,25 @@ def train(
                "train/av_max_bound_loss", av_max_bound_loss / iterations, self.iter_count
            )
        self.writer.add_scalar("train/avg_Q", av_Q / iterations, self.iter_count)
+        self.writer.add_scalar(
+            "train/avg_bound", av_bound / iterations, self.iter_count
+        )
        self.writer.add_scalar("train/max_b", max_b, self.iter_count)
        self.writer.add_scalar("train/max_Q", max_Q, self.iter_count)
        if self.save_every > 0 and self.iter_count % self.save_every == 0:
            self.save(filename=self.model_name, directory=self.save_directory)

    def get_max_bound(
-        self,
-        next_state,
-        discount,
-        max_ang_vel,
-        max_lin_vel,
-        time_step,
-        distance_norm,
-        goal_reward,
-        reward,
-        done,
+        self,
+        next_state,
+        discount,
+        max_ang_vel,
+        max_lin_vel,
+        time_step,
+        distance_norm,
+        goal_reward,
+        reward,
+        done,
    ):
        cos = next_state[:, -4]
        sin = next_state[:, -3]
@@ -304,7 +308,7 @@ def get_max_bound(
        full_turn_steps = torch.floor(turn_steps.abs())
        turn_rew = [
            (
-                -1 * discount**step * max_ang_vel
+                -1 * discount**step * max_ang_vel
                if step
                else torch.zeros(1, device=self.device)
            )
@@ -326,20 +330,20 @@ def get_max_bound(
        final_steps = torch.ceil(distances) + full_turn_steps
        inter_steps = torch.trunc(distances) + full_turn_steps
        final_discount = torch.tensor(
-            [discount**pw for pw in final_steps], device=self.device
+            [discount**pw for pw in final_steps], device=self.device
        )
        final_rew = (
-            torch.ones_like(distances, device=self.device)
-            * goal_reward
-            * final_discount
+            torch.ones_like(distances, device=self.device)
+            * goal_reward
+            * final_discount
        )

        max_inter_steps = inter_steps.max()
        exponents = torch.arange(
            1, max_inter_steps + 1, dtype=torch.float32, device=self.device
        )
        discount_exponents = torch.tensor(
-            [discount**e for e in exponents], device=self.device
+            [discount**e for e in exponents], device=self.device
        )
        inter_rew = torch.tensor(
            [
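Note on get_max_bound: the list comprehensions in the hunks above build per-sample geometric discount factors discount**k from step counts stored in tensors. A standalone sketch (not a proposed change) showing that broadcasting the scalar base gives the same factors in vectorized form:

import torch

discount = 0.99
final_steps = torch.tensor([3.0, 7.0, 0.0])  # made-up step counts

loop_version = torch.tensor([discount**pw for pw in final_steps])
vectorized = discount**final_steps  # scalar base broadcast over the tensor exponent
assert torch.allclose(loop_version, vectorized)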