
Commit 564294f

second refactor
1 parent d290d93 commit 564294f

File tree: 8 files changed (+80, -90 lines)

robot_nav/SIM_ENV/marl_sim.py

Lines changed: 44 additions & 38 deletions

@@ -8,10 +8,10 @@
 
 class MARL_SIM(SIM_ENV):
     """
-    A simulation environment interface for robot navigation using IRSim.
+    A simulation environment interface for robot navigation using IRSim in MARL setting.
 
     This class wraps around the IRSim environment and provides methods for stepping,
-    resetting, and interacting with a mobile robot, including reward computation.
+    resetting, and interacting with mobile robots, including reward computation.
 
     Attributes:
         env (object): The simulation environment instance from IRSim.
@@ -33,6 +33,8 @@ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False):
         robot_info = self.env.get_robot_info(0)
         self.robot_goal = robot_info.goal
         self.num_robots = len(self.env.robot_list)
+        self.x_range = self.env._world.x_range
+        self.y_range = self.env._world.y_range
 
     def step(self, action, connection, combined_weights=None):
         """
@@ -46,7 +48,6 @@ def step(self, action, connection, combined_weights=None):
             (tuple): Contains the latest LIDAR scan, distance to goal, cosine and sine of angle to goal,
                 collision flag, goal reached flag, applied action, and computed reward.
         """
-        # action = [[lin_velocity, ang_velocity], [lin_velocity, ang_velocity], [lin_velocity, ang_velocity], [lin_velocity, ang_velocity], [lin_velocity, ang_velocity]]
         self.env.step(action_id=[i for i in range(self.num_robots)], action=action)
         self.env.render()
 
@@ -139,8 +140,8 @@ def step(self, action, connection, combined_weights=None):
                 obstacle_list=self.env.obstacle_list,
                 init=True,
                 range_limits=[
-                    [1, 1, -3.141592653589793],
-                    [11, 11, 3.141592653589793],
+                    [self.x_range[0] + 1, self.y_range[0] + 1, -3.141592653589793],
+                    [self.x_range[1] - 1, self.y_range[1] - 1, 3.141592653589793],
                 ],
             )
 
@@ -209,8 +210,8 @@ def reset(
             if random_obstacle_ids is None:
                 random_obstacle_ids = [i + self.num_robots for i in range(7)]
             self.env.random_obstacle_position(
-                range_low=[0, 0, -3.14],
-                range_high=[12, 12, 3.14],
+                range_low=[self.x_range[0], self.y_range[0], -3.14],
+                range_high=[self.x_range[1], self.y_range[1], 3.14],
                 ids=random_obstacle_ids,
                 non_overlapping=True,
             )
@@ -221,8 +222,8 @@ def reset(
                 obstacle_list=self.env.obstacle_list,
                 init=True,
                 range_limits=[
-                    [1, 1, -3.141592653589793],
-                    [11, 11, 3.141592653589793],
+                    [self.x_range[0] + 1, self.y_range[0] + 1, -3.141592653589793],
+                    [self.x_range[1] - 1, self.y_range[1] - 1, 3.141592653589793],
                 ],
             )
         else:
@@ -251,44 +252,49 @@ def reset(
             )
 
     @staticmethod
-    def get_reward(goal, collision, action, closest_robots, distance):
+    def get_reward(goal, collision, action, closest_robots, distance, phase=1):
         """
         Calculate the reward for the current step.
 
         Args:
             goal (bool): Whether the goal has been reached.
             collision (bool): Whether a collision occurred.
             action (list): The action taken [linear velocity, angular velocity].
-            laser_scan (list): The LIDAR scan readings.
+            closest_robots (list): Distances to the closest robots.
+            distance (float): Distance to goal.
+            phase (int, optional): Reward function phase. Defaults to 1.
 
         Returns:
             (float): Computed reward for the current state.
         """
 
-        # phase1
-        if goal:
-            return 100.0
-        elif collision:
-            return -100.0 * 3 * action[0]
-        else:
-            r_dist = 1.5 / distance
-            cl_pen = 0
-            for rob in closest_robots:
-                add = 1.5 - rob if rob < 1.5 else 0
-                cl_pen += add
-
-            return action[0] - 0.5 * abs(action[1]) - cl_pen + r_dist
-
-        # phase2
-        # if goal:
-        #     return 70.0
-        # elif collision:
-        #     return -100.0 * 3 * action[0]
-        # else:
-        #     r_dist = 1.5 / distance
-        #     cl_pen = 0
-        #     for rob in closest_robots:
-        #         add = (3 - rob)**2 if rob < 3 else 0
-        #         cl_pen += add
-        #
-        #     return -0.5 * abs(action[1]) - cl_pen
+        match phase:
+            case 1:
+                if goal:
+                    return 100.0
+                elif collision:
+                    return -100.0 * 3 * action[0]
+                else:
+                    r_dist = 1.5 / distance
+                    cl_pen = 0
+                    for rob in closest_robots:
+                        add = 1.5 - rob if rob < 1.5 else 0
+                        cl_pen += add
+
+                    return action[0] - 0.5 * abs(action[1]) - cl_pen + r_dist
+
+            case 2:
+                if goal:
+                    return 70.0
+                elif collision:
+                    return -100.0 * 3 * action[0]
+                else:
+                    cl_pen = 0
+                    for rob in closest_robots:
+                        add = (3 - rob) ** 2 if rob < 3 else 0
+                        cl_pen += add
+
+                    return -0.5 * abs(action[1]) - cl_pen
+
+            case _:
+                raise ValueError("Unknown reward phase")
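
For orientation, here is a small worked example of the two reward phases above, run as plain Python with made-up inputs (the numbers are arbitrary; only the arithmetic mirrors the diff):

# Hypothetical inputs chosen only to illustrate the two phases of get_reward
action = [0.5, 0.3]          # [linear velocity, angular velocity]
closest_robots = [1.0, 2.0]  # distances to the two nearest robots
distance = 2.0               # distance to the goal

# Phase 1: progress-oriented shaping
r_dist = 1.5 / distance                                       # 0.75
cl_pen = sum(1.5 - r for r in closest_robots if r < 1.5)      # 0.5
phase1 = action[0] - 0.5 * abs(action[1]) - cl_pen + r_dist   # 0.5 - 0.15 - 0.5 + 0.75 = 0.6

# Phase 2: stronger separation penalty, no progress term
cl_pen2 = sum((3 - r) ** 2 for r in closest_robots if r < 3)  # 4 + 1 = 5
phase2 = -0.5 * abs(action[1]) - cl_pen2                      # -0.15 - 5 = -5.15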

robot_nav/models/CNNTD3/CNNTD3.py

Lines changed: 2 additions & 2 deletions

@@ -325,8 +325,8 @@ def train(
         state = torch.Tensor(batch_states).to(self.device)
         next_state = torch.Tensor(batch_next_states).to(self.device)
         action = torch.Tensor(batch_actions).to(self.device)
-        reward = torch.Tensor(batch_rewards).to(self.device)
-        done = torch.Tensor(batch_dones).to(self.device)
+        reward = torch.Tensor(batch_rewards).to(self.device).reshape(-1, 1)
+        done = torch.Tensor(batch_dones).to(self.device).reshape(-1, 1)
 
         # Obtain the estimated action from the next state by using the actor-target
         next_action = self.actor_target(next_state)
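
Reshaping reward and done to (batch, 1) presumably keeps the Bellman-target arithmetic at column-vector shape instead of silently broadcasting a 1-D tensor against the (batch, 1) critic output; the same two-line change recurs in DDPG, SAC, and TD3 below. A minimal, shape-only sketch of the pitfall the reshape avoids (not the repository's training code):

import torch

batch = 4
reward = torch.rand(batch)        # (batch,), as it comes out of the replay buffer
target_q = torch.rand(batch, 1)   # critic output, (batch, 1)

wrong = reward + 0.99 * target_q                  # broadcasts to (batch, batch)
right = reward.reshape(-1, 1) + 0.99 * target_q   # stays (batch, 1)
print(wrong.shape, right.shape)   # torch.Size([4, 4]) torch.Size([4, 1])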

robot_nav/models/DDPG/DDPG.py

Lines changed: 2 additions & 2 deletions

@@ -254,8 +254,8 @@ def train(
         state = torch.Tensor(batch_states).to(self.device)
         next_state = torch.Tensor(batch_next_states).to(self.device)
         action = torch.Tensor(batch_actions).to(self.device)
-        reward = torch.Tensor(batch_rewards).to(self.device)
-        done = torch.Tensor(batch_dones).to(self.device)
+        reward = torch.Tensor(batch_rewards).to(self.device).reshape(-1, 1)
+        done = torch.Tensor(batch_dones).to(self.device).reshape(-1, 1)
 
         # Obtain the estimated action from the next state by using the actor-target
         next_action = self.actor_target(next_state)

robot_nav/models/MARL/hsAttention.py

Lines changed: 14 additions & 21 deletions

@@ -9,7 +9,6 @@ def __init__(self, embedding_dim):
         super(Attention, self).__init__()
         self.embedding_dim = embedding_dim
 
-        # CNN for laser scan
         self.embedding1 = nn.Linear(5, 128)
         nn.init.kaiming_uniform_(self.embedding1.weight, nonlinearity="leaky_relu")
         self.embedding2 = nn.Linear(128, embedding_dim)
@@ -28,7 +27,7 @@ def __init__(self, embedding_dim):
         self.k = nn.Linear(10, embedding_dim, bias=False)
         self.v = nn.Linear(10, embedding_dim)
 
-        # Soft attention score network (with distance)
+        # Soft attention score network (with polar other robot goal position)
         self.attn_score_layer = nn.Sequential(
             nn.Linear(embedding_dim * 2, embedding_dim),
             nn.ReLU(),
@@ -58,8 +57,8 @@ def forward(self, embedding):
         )  # assume (cos(θ), sin(θ))
         action = embedding[:, :, 7:9].reshape(batch_size, n_agents, 2)
         goal = embedding[:, :, -2:].reshape(batch_size, n_agents, 2)
-        goal_j = goal.unsqueeze(1).expand(-1, n_agents, -1, -1)  # (B, N, N, 2)
-        pos_i = position.unsqueeze(2)  # (B, N, 1, 2)
+        goal_j = goal.unsqueeze(1).expand(-1, n_agents, -1, -1)
+        pos_i = position.unsqueeze(2)
         goal_rel_vec = goal_j - pos_i
 
         agent_embed = self.encode_agent_features(embed)
@@ -100,10 +99,10 @@ def forward(self, embedding):
                 action.unsqueeze(1).expand(-1, n_agents, -1, -1),  # (B, N, N, 2)
             ],
             dim=-1,
-        )  # (B, N, N, 7)
+        )
 
         # Broadcast h_i along N (for each pair)
-        h_i_expanded = h_i.expand(-1, -1, n_agents, -1)  # (B, N, N, D)
+        h_i_expanded = h_i.expand(-1, -1, n_agents, -1)
 
         # Remove self-pairs using mask
         mask = ~torch.eye(n_agents, dtype=torch.bool, device=embedding.device)
@@ -115,7 +114,7 @@ def forward(self, embedding):
         )
 
         # Concatenate agent embedding and edge features
-        hard_input = torch.cat([h_i_flat, edge_flat], dim=-1)  # (B*N, N-1, D+7)
+        hard_input = torch.cat([h_i_flat, edge_flat], dim=-1)
 
         # Hard attention forward
         h_hard = self.hard_mlp(hard_input)
@@ -125,8 +124,7 @@ def forward(self, embedding):
         ].unsqueeze(2)
         hard_weights = hard_weights.view(batch_size, n_agents, n_agents - 1)
 
-        unnorm_rel_vec = rel_vec
-        unnorm_rel_dist = torch.linalg.vector_norm(unnorm_rel_vec, dim=-1, keepdim=True)
+        unnorm_rel_dist = torch.linalg.vector_norm(rel_vec, dim=-1, keepdim=True)
         unnorm_rel_dist = unnorm_rel_dist[:, mask].reshape(
             batch_size * n_agents, n_agents - 1, 1
         )
@@ -151,23 +149,21 @@ def forward(self, embedding):
 
         soft_edge_features = torch.cat([edge_features, goal_polar], dim=-1)
         for i in range(n_agents):
-            q_i = q[:, i : i + 1, :]  # (B, 1, D)
+            q_i = q[:, i : i + 1, :]
             mask = torch.ones(n_agents, dtype=torch.bool, device=edge_features.device)
             mask[i] = False
             edge_i_wo_self = soft_edge_features[:, i, mask, :]
-            edge_i_wo_self = edge_i_wo_self.squeeze(1)  # (B, N-1, 7)
+            edge_i_wo_self = edge_i_wo_self.squeeze(1)
             k = F.leaky_relu(self.k(edge_i_wo_self))
 
-            q_i_expanded = q_i.expand(-1, n_agents - 1, -1)  # (B, N-1, D)
-            attention_input = torch.cat([q_i_expanded, k], dim=-1)  # (B, N-1, D+7)
+            q_i_expanded = q_i.expand(-1, n_agents - 1, -1)
+            attention_input = torch.cat([q_i_expanded, k], dim=-1)
 
             # Score computation
-            scores = self.attn_score_layer(attention_input).transpose(
-                1, 2
-            )  # (B, 1, N-1)
+            scores = self.attn_score_layer(attention_input).transpose(1, 2)
 
             # Mask using hard weights
-            h_weights = hard_weights[:, i].unsqueeze(1)  # (B, 1, N-1)
+            h_weights = hard_weights[:, i].unsqueeze(1)
             mask = (h_weights > 0.5).float()
 
             # All-zero mask handling
@@ -200,11 +196,8 @@ def forward(self, embedding):
             )
             entropy_list.append(entropy)
 
-            # Project each other agent's features to embedding dim *before* the attention-weighted sum
             v_j = F.leaky_relu(self.v(edge_i_wo_self))
-            attn_output = torch.bmm(combined_weights, v_j).squeeze(
-                1
-            )  # (B, embedding_dim)
+            attn_output = torch.bmm(combined_weights, v_j).squeeze(1)
             attention_outputs.append(attn_output)
 
         comb_w = torch.stack(combined_w, dim=1).reshape(n_agents, -1)
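
The loop above gates a soft attention score with a binary hard-attention decision (h_weights > 0.5) before the attention-weighted sum over the other agents' value vectors v_j. A toy sketch of that general pattern with random stand-in tensors (not the module's exact computation, which also tracks an entropy term and has explicit all-zero-mask handling):

import torch

B, N_minus_1, D = 2, 4, 8                            # batch, other agents, embedding dim
scores = torch.randn(B, 1, N_minus_1)                # soft attention scores for one agent
hard = (torch.rand(B, 1, N_minus_1) > 0.5).float()   # stand-in for the hard gate
v_j = torch.randn(B, N_minus_1, D)                   # other agents' value vectors

masked = scores.masked_fill(hard == 0, float("-inf"))
weights = torch.softmax(masked, dim=-1)              # renormalise over surviving neighbours
weights = torch.nan_to_num(weights)                  # rows where every neighbour was gated out
attn_output = torch.bmm(weights, v_j).squeeze(1)     # (B, D), mirroring the final bmm above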

robot_nav/models/MARL/marlTD3.py

Lines changed: 12 additions & 16 deletions

@@ -215,6 +215,9 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        bce_weight=0.1,
+        entropy_weight=1,
+        connection_proximity_threshold=4,
     ):
         av_Q = 0
         max_Q = -inf
@@ -298,19 +301,16 @@ def train(
                 current_Q2, target_Q
             )
 
-            proximity_threshold = 4  # You may need to adjust this
-            targets = (unnorm_rel_dist.flatten() < proximity_threshold).float()
+            targets = (
+                unnorm_rel_dist.flatten() < connection_proximity_threshold
+            ).float()
             flat_logits = hard_logits.flatten()
             bce_loss = F.binary_cross_entropy_with_logits(flat_logits, targets)
 
-            bce_weight = 0.1
            av_critic_bce_loss.append(bce_loss)
 
-            critic_entropy_weight = 1  # or tuneable
             total_loss = (
-                critic_loss
-                - critic_entropy_weight * mean_entropy
-                + bce_weight * bce_loss
+                critic_loss - entropy_weight * mean_entropy + bce_weight * bce_loss
             )
             av_critic_entropy.append(mean_entropy)
 
@@ -328,20 +328,18 @@ def train(
                 action, hard_logits, unnorm_rel_dist, mean_entropy, hard_weights, _ = (
                     self.actor(state, detach_attn=False)
                 )
-                targets = (unnorm_rel_dist.flatten() < proximity_threshold).float()
+                targets = (
+                    unnorm_rel_dist.flatten() < connection_proximity_threshold
+                ).float()
                 flat_logits = hard_logits.flatten()
                 bce_loss = F.binary_cross_entropy_with_logits(flat_logits, targets)
 
-                bce_weight = 0.1
                 av_actor_bce_loss.append(bce_loss)
 
                 actor_Q, _, _, _, _, _ = self.critic(state, action)
                 actor_loss = -actor_Q.mean()
-                actor_entropy_weight = 0.05
                 total_loss = (
-                    actor_loss
-                    - actor_entropy_weight * mean_entropy
-                    + bce_weight * bce_loss
+                    actor_loss - entropy_weight * mean_entropy + bce_weight * bce_loss
                 )
                 av_actor_entropy.append(mean_entropy)
 
@@ -458,9 +456,7 @@ def prepare_state(
             poses (list): Each agent's global pose [x, y, theta].
             distance, cos, sin: Unused, can be removed or ignored.
             collision (list): Collision flags per agent.
-            goal (list): Goal reached flags per agent.
             action (list): Last action taken [lin_vel, ang_vel].
-            positions (list): Extra features (e.g., neighbors).
             goal_positions (list): Each agent's goal [x, y].
 
         Returns:
@@ -483,7 +479,7 @@ def prepare_state(
             heading_sin = np.sin(theta)
 
             # Last velocity
-            lin_vel = act[0] * 2  # Assuming original range [-0.5, 0.5]
+            lin_vel = act[0] * 2  # Assuming original range [0, 0.5]
             ang_vel = (act[1] + 1) / 2  # Assuming original range [-1, 1]
 
             # Final state vector
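
The hard-coded proximity_threshold, bce_weight, and entropy weights are promoted to train() keyword arguments, and the critic and actor now compose their auxiliary losses identically. A condensed sketch of that composition with stand-in tensors (not the repository's training loop):

import torch
import torch.nn.functional as F

bce_weight, entropy_weight, connection_proximity_threshold = 0.1, 1.0, 4  # new defaults

# Stand-ins for quantities produced by the critic/actor forward passes
critic_loss = torch.tensor(1.25)
mean_entropy = torch.tensor(0.40)     # entropy of the soft attention weights
hard_logits = torch.randn(6)          # hard-attention logits, flattened
unnorm_rel_dist = torch.rand(6) * 8   # unnormalised inter-robot distances

# Supervise the hard gate: "connected" when robots are closer than the threshold
targets = (unnorm_rel_dist.flatten() < connection_proximity_threshold).float()
bce_loss = F.binary_cross_entropy_with_logits(hard_logits.flatten(), targets)

total_loss = critic_loss - entropy_weight * mean_entropy + bce_weight * bce_loss

Note that the actor previously used its own hard-coded actor_entropy_weight = 0.05; after this change both losses share the single entropy_weight argument, so the actor's entropy term is weighted more heavily at the default value.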

robot_nav/models/SAC/SAC.py

Lines changed: 2 additions & 2 deletions

@@ -346,8 +346,8 @@ def update(self, replay_buffer, step, batch_size):
         state = torch.Tensor(batch_states).to(self.device)
         next_state = torch.Tensor(batch_next_states).to(self.device)
         action = torch.Tensor(batch_actions).to(self.device)
-        reward = torch.Tensor(batch_rewards).to(self.device)
-        done = torch.Tensor(batch_dones).to(self.device)
+        reward = torch.Tensor(batch_rewards).to(self.device).reshape(-1, 1)
+        done = torch.Tensor(batch_dones).to(self.device).reshape(-1, 1)
         self.train_metrics_dict["train/batch_reward_av"].append(
             batch_rewards.mean().item()
         )

robot_nav/models/TD3/TD3.py

Lines changed: 2 additions & 2 deletions

@@ -268,8 +268,8 @@ def train(
         state = torch.Tensor(batch_states).to(self.device)
         next_state = torch.Tensor(batch_next_states).to(self.device)
         action = torch.Tensor(batch_actions).to(self.device)
-        reward = torch.Tensor(batch_rewards).to(self.device)
-        done = torch.Tensor(batch_dones).to(self.device)
+        reward = torch.Tensor(batch_rewards).to(self.device).reshape(-1, 1)
+        done = torch.Tensor(batch_dones).to(self.device).reshape(-1, 1)
 
         # Obtain the estimated action from the next state by using the actor-target
         next_action = self.actor_target(next_state)

robot_nav/multi_robot_world.yaml

Lines changed: 2 additions & 7 deletions

@@ -1,8 +1,8 @@
 world:
   height: 12  # the height of the world
   width: 12  # the height of the world
-  step_time: 0.3  # 10Hz calculate each step
-  sample_time: 0.3  # 10 Hz for render and data extraction
+  step_time: 0.3  # Calculate each step
+  sample_time: 0.3  # For render and data extraction
   collision_mode: 'reactive'
 
 robot:
@@ -20,8 +20,3 @@ robot:
 
 plot:
   show_trajectory: False
-
-#obstacle:
-#  - shape: { name: 'linestring', vertices: [ [ 0, 0 ], [ 12, 0 ], [ 12, 12 ], [ 0, 12 ],[ 0, 0 ] ] } # vertices
-#    kinematics: {name: 'static'}
-#    state: [ 0, 0, 0 ]
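
With step_time: 0.3 the simulator advances 1 / 0.3 ≈ 3.3 steps per second, so the old "10Hz" comments did not match the configured value, which is presumably why they were reworded; the commented-out boundary obstacle is simply dropped.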
