6 changes: 6 additions & 0 deletions docs/api/IR-SIM/ir-marl-sim.md
@@ -0,0 +1,6 @@
# MARL-IR-SIM

::: robot_nav.SIM_ENV.marl_sim
options:
show_root_heading: true
show_source: true
2 changes: 1 addition & 1 deletion docs/api/IR-SIM/ir-sim.md
@@ -1,6 +1,6 @@
# IR-SIM

::: robot_nav.sim
::: robot_nav.SIM_ENV.sim
options:
show_root_heading: true
show_source: true
6 changes: 6 additions & 0 deletions docs/api/models/MARL/Attention.md
@@ -0,0 +1,6 @@
# Hard-Soft Attention

::: robot_nav.models.MARL.hardsoftAttention
options:
show_root_heading: true
show_source: true
6 changes: 6 additions & 0 deletions docs/api/models/MARL/TD3.md
@@ -0,0 +1,6 @@
# MARL TD3

::: robot_nav.models.MARL.marlTD3
options:
show_root_heading: true
show_source: true
7 changes: 6 additions & 1 deletion mkdocs.yml
@@ -4,7 +4,9 @@ extra:
nav:
- Home: index.md
- API Reference:
IR-SIM: api/IR-SIM/ir-sim
IR-SIM:
- SIM: api/IR-SIM/ir-sim
- MARL SIM: api/IR-SIM/ir-marl-sim
Models:
- DDPG: api/models/DDPG.md
- TD3: api/models/TD3.md
@@ -13,6 +15,9 @@ nav:
- HCM: api/models/HCM.md
- PPO: api/models/PPO.md
- SAC: api/models/SAC.md
- MARL:
- HardSoft Attention: api/models/MARL/Attention
- TD3: api/models/MARL/TD3
Training:
- Train: api/Training/train.md
- Train RNN: api/Training/trainrnn.md
78 changes: 51 additions & 27 deletions robot_nav/SIM_ENV/marl_sim.py
@@ -8,23 +8,26 @@

class MARL_SIM(SIM_ENV):
"""
A simulation environment interface for robot navigation using IRSim in MARL setting.
Simulation environment for multi-agent robot navigation using IRSim.

This class wraps around the IRSim environment and provides methods for stepping,
resetting, and interacting with mobile robots, including reward computation.
This class extends the SIM_ENV and provides a wrapper for multi-robot
simulation and interaction, supporting reward computation and custom reset logic.

Attributes:
env (object): The simulation environment instance from IRSim.
robot_goal (np.ndarray): The goal position of the robot.
env (object): IRSim simulation environment instance.
robot_goal (np.ndarray): Current goal position(s) for the robots.
num_robots (int): Number of robots in the environment.
x_range (tuple): World x-range.
y_range (tuple): World y-range.
"""

def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False):
"""
Initialize the simulation environment.
Initialize the MARL_SIM environment.

Args:
world_file (str): Path to the world configuration YAML file.
disable_plotting (bool): If True, disables rendering and plotting.
world_file (str, optional): Path to the world configuration YAML file.
disable_plotting (bool, optional): If True, disables IRSim rendering and plotting.
"""
display = False if disable_plotting else True
self.env = irsim.make(
@@ -38,15 +41,26 @@ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False):

def step(self, action, connection, combined_weights=None):
"""
Perform one step in the simulation using the given control commands.
Perform a simulation step for all robots using the provided actions and connections.

Args:
lin_velocity (float): Linear velocity to apply to the robot.
ang_velocity (float): Angular velocity to apply to the robot.
action (list): List of actions for each robot [[lin_vel, ang_vel], ...].
connection (Tensor): Tensor of shape (num_robots, num_robots-1) containing logits indicating connections between robots.
combined_weights (Tensor or None, optional): Optional weights for each connection, shape (num_robots, num_robots-1).

Returns:
(tuple): Contains the latest LIDAR scan, distance to goal, cosine and sine of angle to goal,
collision flag, goal reached flag, applied action, and computed reward.
tuple: (
poses (list): List of [x, y, theta] for each robot,
distances (list): Distance to goal for each robot,
coss (list): Cosine of angle to goal for each robot,
sins (list): Sine of angle to goal for each robot,
collisions (list): Collision status for each robot,
goals (list): Goal reached status for each robot,
action (list): Actions applied,
rewards (list): Rewards computed,
positions (list): Current [x, y] for each robot,
goal_positions (list): Goal [x, y] for each robot,
)
"""
self.env.step(action_id=[i for i in range(self.num_robots)], action=action)
self.env.render()
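
For orientation, here is a minimal sketch of driving the `step()` interface documented above. The class name, module path (`robot_nav/SIM_ENV/marl_sim.py`), attribute `num_robots`, and the return ordering come from this diff; the world file, velocities, and the random logits standing in for a policy output are placeholders.

```python
# Minimal sketch, not the repository's training code: random logits stand in
# for the attention/policy output that would normally produce `connection`.
import torch

from robot_nav.SIM_ENV.marl_sim import MARL_SIM

sim = MARL_SIM(world_file="multi_robot_world.yaml", disable_plotting=True)

# One [linear_velocity, angular_velocity] pair per robot.
action = [[0.5, 0.0] for _ in range(sim.num_robots)]

# Hard-attention logits toward every other robot, shape (num_robots, num_robots - 1).
connection = torch.randn(sim.num_robots, sim.num_robots - 1)

(poses, distances, coss, sins, collisions, goals,
 applied_action, rewards, positions, goal_positions) = sim.step(action, connection)
```
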
@@ -166,17 +180,27 @@ def reset(
random_obstacle_ids=None,
):
"""
Reset the simulation environment, optionally setting robot and obstacle states.
Reset the simulation environment and optionally set robot and obstacle positions.

Args:
robot_state (list or None): Initial state of the robot as a list of [x, y, theta, speed].
robot_goal (list or None): Goal state for the robot.
random_obstacles (bool): Whether to randomly reposition obstacles.
random_obstacle_ids (list or None): Specific obstacle IDs to randomize.
robot_state (list or None, optional): Initial state for robots as [x, y, theta, speed].
robot_goal (list or None, optional): Goal position(s) for the robots.
random_obstacles (bool, optional): If True, randomly position obstacles.
random_obstacle_ids (list or None, optional): IDs of obstacles to randomize.

Returns:
(tuple): Initial observation after reset, including LIDAR scan, distance, cos/sin,
and reward-related flags and values.
tuple: (
poses (list): List of [x, y, theta] for each robot,
distances (list): Distance to goal for each robot,
coss (list): Cosine of angle to goal for each robot,
sins (list): Sine of angle to goal for each robot,
collisions (list): All False after reset,
goals (list): All False after reset,
action (list): Initial action ([[0.0, 0.0], ...]),
rewards (list): Rewards for initial state,
positions (list): Initial [x, y] for each robot,
goal_positions (list): Initial goal [x, y] for each robot,
)
"""
if robot_state is None:
robot_state = [[random.uniform(3, 9)], [random.uniform(3, 9)], [0]]
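
A sketch of how `reset()` and `step()` might be combined into a bare episode loop, assuming all `reset()` arguments keep their documented defaults; the step budget, velocities, and termination check are placeholders.

```python
# Bare episode skeleton around reset()/step(); a sketch only, the real training
# scripts in the repository may differ.
import torch

from robot_nav.SIM_ENV.marl_sim import MARL_SIM

sim = MARL_SIM(disable_plotting=True)
obs = sim.reset(random_obstacles=True)

for _ in range(300):  # arbitrary step budget
    connection = torch.randn(sim.num_robots, sim.num_robots - 1)  # placeholder policy output
    action = [[0.5, 0.0] for _ in range(sim.num_robots)]
    obs = sim.step(action, connection)
    collisions, goals = obs[4], obs[5]  # indices per the documented return tuple
    if any(collisions) or all(goals):
        obs = sim.reset()
```
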
@@ -254,18 +278,18 @@ def reset(
@staticmethod
def get_reward(goal, collision, action, closest_robots, distance, phase=1):
"""
Calculate the reward for the current step.
Calculate the reward for a robot given the current state and action.

Args:
goal (bool): Whether the goal has been reached.
goal (bool): Whether the robot reached its goal.
collision (bool): Whether a collision occurred.
action (list): The action taken [linear velocity, angular velocity].
closest_robots (list): Distances to the closest robots.
distance (float): Distance to goal.
phase (int, optional): Reward function phase. Defaults to 1.
action (list): [linear_velocity, angular_velocity] applied.
closest_robots (list): Distances to the closest other robots.
distance (float): Distance to the goal.
phase (int, optional): Reward phase/function selector (default: 1).

Returns:
(float): Computed reward for the current state.
float: Computed reward.
"""

match phase:
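
The body of `get_reward` is collapsed in this diff, so the snippet below is a purely hypothetical stand-in. It only mirrors the documented signature and the kinds of terms the docstring names (goal bonus, collision penalty, action and proximity shaping); none of the constants or phase behaviour come from the repository.

```python
# Hypothetical illustration only -- the actual phase-dependent reward is not
# shown in this diff. All constants below are invented.
def get_reward(goal, collision, action, closest_robots, distance, phase=1):
    if goal:
        return 100.0                       # terminal bonus for reaching the goal
    if collision:
        return -100.0                      # terminal penalty for a collision
    lin_vel, ang_vel = action
    reward = lin_vel - 0.5 * abs(ang_vel)  # favour forward motion over spinning
    reward -= sum(max(0.0, 1.0 - d) for d in closest_robots)  # penalize crowding
    if phase > 1:                          # later phases add goal-distance shaping (assumed)
        reward -= 0.1 * distance
    return reward
```
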
Empty file.
73 changes: 65 additions & 8 deletions robot_nav/models/MARL/hardsoftAttention.py
@@ -5,7 +5,33 @@


class Attention(nn.Module):
"""
Multi-robot attention mechanism for learning hard and soft attentions.

This module provides both hard (binary) and soft (weighted) attention,
combining feature encoding, relative pose and goal geometry, and
message passing between agents.

Args:
embedding_dim (int): Dimension of the agent embedding vector.

Attributes:
embedding1 (nn.Linear): First layer for agent feature encoding.
embedding2 (nn.Linear): Second layer for agent feature encoding.
hard_mlp (nn.Sequential): MLP to process concatenated agent and edge features.
hard_encoding (nn.Linear): Outputs logits for hard (binary) attention.
q, k, v (nn.Linear): Layers for query, key, value projections for soft attention.
attn_score_layer (nn.Sequential): Computes unnormalized attention scores for each pair.
decode_1, decode_2 (nn.Linear): Decoding layers to produce the final attended embedding.
"""

def __init__(self, embedding_dim):
"""
Initialize attention mechanism for multi-agent communication.

Args:
embedding_dim (int): Output embedding dimension per agent.
"""
super(Attention, self).__init__()
self.embedding_dim = embedding_dim
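
A minimal usage sketch of the module, assuming the import path from the mkdocstrings entry (`robot_nav.models.MARL.hardsoftAttention`) and the `(B, N, D)` input layout with `D = 11` described in the `forward()` docstring further down; the batch size, agent count, and `embedding_dim` are arbitrary.

```python
# Illustrative forward pass with random per-agent features; shapes follow the
# forward() docstring, all concrete numbers are arbitrary.
import torch

from robot_nav.models.MARL.hardsoftAttention import Attention

attn = Attention(embedding_dim=64)
x = torch.rand(8, 4, 11)  # (batch, agents, features): position, heading cos/sin, internal, goal

(att_embedding, hard_logits, unnorm_rel_dist,
 mean_entropy, hard_weights, comb_w) = attn(x)

print(att_embedding.shape)  # expected (8 * 4, 2 * 64) per the docstring
```
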

@@ -41,30 +67,59 @@ def __init__(self, embedding_dim):
nn.init.kaiming_uniform_(self.decode_2.weight, nonlinearity="leaky_relu")

def encode_agent_features(self, embed):
"""
Encode agent features using a small MLP.

Args:
embed (Tensor): Input features (B*N, 5).

Returns:
Tensor: Encoded embedding (B*N, embedding_dim).
"""
embed = F.leaky_relu(self.embedding1(embed))
embed = F.leaky_relu(self.embedding2(embed))
return embed

def forward(self, embedding):
"""
Forward pass: computes both hard and soft attentions among agents,
produces the attended embedding for each agent, as well as diagnostic info.

Args:
embedding (Tensor): Input tensor of shape (B, N, D), where D is at least 11.

Returns:
tuple:
att_embedding (Tensor): Final attended embedding, shape (B*N, 2*embedding_dim).
hard_logits (Tensor): Logits for hard attention, (B*N, N-1).
unnorm_rel_dist (Tensor): Pairwise distances between agents (not normalized), (B*N, N-1, 1).
mean_entropy (Tensor): Mean entropy of soft attention distributions.
hard_weights (Tensor): Binary hard attention mask, (B, N, N-1).
comb_w (Tensor): Final combined attention weights, (N, N*(N-1)).
"""
if embedding.dim() == 2:
embedding = embedding.unsqueeze(0)
batch_size, n_agents, _ = embedding.shape

# Extract sub-features
embed = embedding[:, :, 4:9].reshape(batch_size * n_agents, -1)
position = embedding[:, :, :2].reshape(batch_size, n_agents, 2)
heading = embedding[:, :, 2:4].reshape(
batch_size, n_agents, 2
) # assume (cos(θ), sin(θ))
action = embedding[:, :, 7:9].reshape(batch_size, n_agents, 2)
goal = embedding[:, :, -2:].reshape(batch_size, n_agents, 2)

# Compute pairwise relative goal vectors (for each i,j)
goal_j = goal.unsqueeze(1).expand(-1, n_agents, -1, -1)
pos_i = position.unsqueeze(2)
goal_rel_vec = goal_j - pos_i

# Encode agent features
agent_embed = self.encode_agent_features(embed)
agent_embed = agent_embed.view(batch_size, n_agents, self.embedding_dim)

# For hard attention
# Prep for hard attention: compute all relative geometry for each agent pair
h_i = agent_embed.unsqueeze(2) # (B, N, 1, D)
pos_i = position.unsqueeze(2) # (B, N, 1, 2)
pos_j = position.unsqueeze(1) # (B, 1, N, 2)
@@ -88,7 +143,7 @@ def forward(self, embedding):
heading_j_cos = heading_j[..., 0] # (B, 1, N)
heading_j_sin = heading_j[..., 1] # (B, 1, N)

# Stack edge features
# Edge features for hard attention
edge_features = torch.cat(
[
rel_dist, # (B, N, N, 1)
@@ -101,7 +156,7 @@ def forward(self, embedding):
dim=-1,
)

# Broadcast h_i along N (for each pair)
# Broadcast agent embedding for all pairs (except self-pairs)
h_i_expanded = h_i.expand(-1, -1, n_agents, -1)

# Remove self-pairs using mask
@@ -129,13 +184,14 @@ def forward(self, embedding):
batch_size * n_agents, n_agents - 1, 1
)

# Soft attention
# ---- Soft attention computation ----
q = self.q(agent_embed)

attention_outputs = []
entropy_list = []
combined_w = []

# Goal-relative polar features for soft attention
goal_rel_dist = torch.linalg.vector_norm(goal_rel_vec, dim=-1, keepdim=True)
goal_angle_global = torch.atan2(goal_rel_vec[..., 1], goal_rel_vec[..., 0])
heading_angle = torch.atan2(heading_i[..., 1], heading_i[..., 0])
@@ -147,6 +203,7 @@ def forward(self, embedding):
[goal_rel_dist, goal_rel_angle_cos, goal_rel_angle_sin], dim=-1
)

# Soft attention edge features (include goal polar)
soft_edge_features = torch.cat([edge_features, goal_polar], dim=-1)
for i in range(n_agents):
q_i = q[:, i : i + 1, :]
@@ -159,10 +216,10 @@ def forward(self, embedding):
q_i_expanded = q_i.expand(-1, n_agents - 1, -1)
attention_input = torch.cat([q_i_expanded, k], dim=-1)

# Score computation
# Score computation (per pair)
scores = self.attn_score_layer(attention_input).transpose(1, 2)

# Mask using hard weights
# Mask using hard attention
h_weights = hard_weights[:, i].unsqueeze(1)
mask = (h_weights > 0.5).float()

Expand All @@ -183,12 +240,12 @@ def forward(self, embedding):
combined_weights = soft_weights * mask # (B, 1, N-1)
combined_w.append(combined_weights)

# Normalize combined_weights for entropy calculation
# Normalize for entropy calculation
combined_weights_norm = combined_weights / (
combined_weights.sum(dim=-1, keepdim=True) + epsilon
)

# Calculate entropy from combined_weights
# Entropy for analysis/logging
entropy = (
-(combined_weights_norm * (combined_weights_norm + epsilon).log())
.sum(dim=-1)
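
For clarity, here is a self-contained sketch of the hard-mask / renormalize / entropy pattern used in the loop above; the softmax over scores, the epsilon value, and the shapes are placeholders rather than the module's exact configuration.

```python
# Generic illustration of combining soft weights with a hard mask and logging
# the entropy of the renormalized distribution; not the module's exact code.
import torch

epsilon = 1e-6
scores = torch.randn(1, 1, 3)                   # unnormalized scores toward 3 neighbours
hard_mask = torch.tensor([[[1.0, 0.0, 1.0]]])   # hard attention keeps neighbours 0 and 2

soft_weights = torch.softmax(scores, dim=-1)    # softmax is assumed here
combined = soft_weights * hard_mask             # zero out masked-off neighbours

# Renormalize the kept weights before measuring how spread out they are.
combined_norm = combined / (combined.sum(dim=-1, keepdim=True) + epsilon)
entropy = -(combined_norm * (combined_norm + epsilon).log()).sum(dim=-1).mean()
```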