diff --git a/docs/api/IR-SIM/ir-marl-sim.md b/docs/api/IR-SIM/ir-marl-sim.md new file mode 100644 index 0000000..c576a23 --- /dev/null +++ b/docs/api/IR-SIM/ir-marl-sim.md @@ -0,0 +1,6 @@ +# MARL-IR-SIM + +::: robot_nav.SIM_ENV.marl_sim + options: + show_root_heading: true + show_source: true \ No newline at end of file diff --git a/docs/api/IR-SIM/ir-sim.md b/docs/api/IR-SIM/ir-sim.md index 1807be7..e236b00 100644 --- a/docs/api/IR-SIM/ir-sim.md +++ b/docs/api/IR-SIM/ir-sim.md @@ -1,6 +1,6 @@ # IR-SIM -::: robot_nav.sim +::: robot_nav.SIM_ENV.sim options: show_root_heading: true show_source: true \ No newline at end of file diff --git a/docs/api/models/MARL/Attention.md b/docs/api/models/MARL/Attention.md new file mode 100644 index 0000000..361fa59 --- /dev/null +++ b/docs/api/models/MARL/Attention.md @@ -0,0 +1,6 @@ +# Hard-Soft Attention + +::: robot_nav.models.MARL.hardsoftAttention + options: + show_root_heading: true + show_source: true diff --git a/docs/api/models/MARL/TD3.md b/docs/api/models/MARL/TD3.md new file mode 100644 index 0000000..ac99d61 --- /dev/null +++ b/docs/api/models/MARL/TD3.md @@ -0,0 +1,6 @@ +# MARL TD3 + +::: robot_nav.models.MARL.marlTD3 + options: + show_root_heading: true + show_source: true diff --git a/mkdocs.yml b/mkdocs.yml index fa58dd0..6eccc3c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,7 +4,9 @@ extra: nav: - Home: index.md - API Reference: - IR-SIM: api/IR-SIM/ir-sim + IR-SIM: + - SIM: api/IR-SIM/ir-sim + - MARL SIM: api/IR-SIM/ir-marl-sim Models: - DDPG: api/models/DDPG.md - TD3: api/models/TD3.md @@ -13,6 +15,9 @@ nav: - HCM: api/models/HCM.md - PPO: api/models/PPO.md - SAC: api/models/SAC.md + - MARL: + - HardSoft Attention: api/models/MARL/Attention + - TD3: api/models/MARL/TD3 Training: - Train: api/Training/train.md - Train RNN: api/Training/trainrnn.md diff --git a/robot_nav/SIM_ENV/marl_sim.py b/robot_nav/SIM_ENV/marl_sim.py index 8fd10ba..4016786 100644 --- a/robot_nav/SIM_ENV/marl_sim.py +++ b/robot_nav/SIM_ENV/marl_sim.py @@ -8,23 +8,26 @@ class MARL_SIM(SIM_ENV): """ - A simulation environment interface for robot navigation using IRSim in MARL setting. + Simulation environment for multi-agent robot navigation using IRSim. - This class wraps around the IRSim environment and provides methods for stepping, - resetting, and interacting with mobile robots, including reward computation. + This class extends the SIM_ENV and provides a wrapper for multi-robot + simulation and interaction, supporting reward computation and custom reset logic. Attributes: - env (object): The simulation environment instance from IRSim. - robot_goal (np.ndarray): The goal position of the robot. + env (object): IRSim simulation environment instance. + robot_goal (np.ndarray): Current goal position(s) for the robots. + num_robots (int): Number of robots in the environment. + x_range (tuple): World x-range. + y_range (tuple): World y-range. """ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False): """ - Initialize the simulation environment. + Initialize the MARL_SIM environment. Args: - world_file (str): Path to the world configuration YAML file. - disable_plotting (bool): If True, disables rendering and plotting. + world_file (str, optional): Path to the world configuration YAML file. + disable_plotting (bool, optional): If True, disables IRSim rendering and plotting. 
""" display = False if disable_plotting else True self.env = irsim.make( @@ -38,15 +41,26 @@ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False): def step(self, action, connection, combined_weights=None): """ - Perform one step in the simulation using the given control commands. + Perform a simulation step for all robots using the provided actions and connections. Args: - lin_velocity (float): Linear velocity to apply to the robot. - ang_velocity (float): Angular velocity to apply to the robot. + action (list): List of actions for each robot [[lin_vel, ang_vel], ...]. + connection (Tensor): Tensor of shape (num_robots, num_robots-1) containing logits indicating connections between robots. + combined_weights (Tensor or None, optional): Optional weights for each connection, shape (num_robots, num_robots-1). Returns: - (tuple): Contains the latest LIDAR scan, distance to goal, cosine and sine of angle to goal, - collision flag, goal reached flag, applied action, and computed reward. + tuple: ( + poses (list): List of [x, y, theta] for each robot, + distances (list): Distance to goal for each robot, + coss (list): Cosine of angle to goal for each robot, + sins (list): Sine of angle to goal for each robot, + collisions (list): Collision status for each robot, + goals (list): Goal reached status for each robot, + action (list): Actions applied, + rewards (list): Rewards computed, + positions (list): Current [x, y] for each robot, + goal_positions (list): Goal [x, y] for each robot, + ) """ self.env.step(action_id=[i for i in range(self.num_robots)], action=action) self.env.render() @@ -166,17 +180,27 @@ def reset( random_obstacle_ids=None, ): """ - Reset the simulation environment, optionally setting robot and obstacle states. + Reset the simulation environment and optionally set robot and obstacle positions. Args: - robot_state (list or None): Initial state of the robot as a list of [x, y, theta, speed]. - robot_goal (list or None): Goal state for the robot. - random_obstacles (bool): Whether to randomly reposition obstacles. - random_obstacle_ids (list or None): Specific obstacle IDs to randomize. + robot_state (list or None, optional): Initial state for robots as [x, y, theta, speed]. + robot_goal (list or None, optional): Goal position(s) for the robots. + random_obstacles (bool, optional): If True, randomly position obstacles. + random_obstacle_ids (list or None, optional): IDs of obstacles to randomize. Returns: - (tuple): Initial observation after reset, including LIDAR scan, distance, cos/sin, - and reward-related flags and values. + tuple: ( + poses (list): List of [x, y, theta] for each robot, + distances (list): Distance to goal for each robot, + coss (list): Cosine of angle to goal for each robot, + sins (list): Sine of angle to goal for each robot, + collisions (list): All False after reset, + goals (list): All False after reset, + action (list): Initial action ([[0.0, 0.0], ...]), + rewards (list): Rewards for initial state, + positions (list): Initial [x, y] for each robot, + goal_positions (list): Initial goal [x, y] for each robot, + ) """ if robot_state is None: robot_state = [[random.uniform(3, 9)], [random.uniform(3, 9)], [0]] @@ -254,18 +278,18 @@ def reset( @staticmethod def get_reward(goal, collision, action, closest_robots, distance, phase=1): """ - Calculate the reward for the current step. + Calculate the reward for a robot given the current state and action. Args: - goal (bool): Whether the goal has been reached. 
+ goal (bool): Whether the robot reached its goal. collision (bool): Whether a collision occurred. - action (list): The action taken [linear velocity, angular velocity]. - closest_robots (list): Distances to the closest robots. - distance (float): Distance to goal. - phase (int, optional): Reward function phase. Defaults to 1. + action (list): [linear_velocity, angular_velocity] applied. + closest_robots (list): Distances to the closest other robots. + distance (float): Distance to the goal. + phase (int, optional): Reward phase/function selector (default: 1). Returns: - (float): Computed reward for the current state. + float: Computed reward. """ match phase: diff --git a/robot_nav/models/MARL/__init__.py b/robot_nav/models/MARL/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/robot_nav/models/MARL/hardsoftAttention.py b/robot_nav/models/MARL/hardsoftAttention.py index 4978a9f..75b8b6c 100644 --- a/robot_nav/models/MARL/hardsoftAttention.py +++ b/robot_nav/models/MARL/hardsoftAttention.py @@ -5,7 +5,33 @@ class Attention(nn.Module): + """ + Multi-robot attention mechanism for learning hard and soft attentions. + + This module provides both hard (binary) and soft (weighted) attention, + combining feature encoding, relative pose and goal geometry, and + message passing between agents. + + Args: + embedding_dim (int): Dimension of the agent embedding vector. + + Attributes: + embedding1 (nn.Linear): First layer for agent feature encoding. + embedding2 (nn.Linear): Second layer for agent feature encoding. + hard_mlp (nn.Sequential): MLP to process concatenated agent and edge features. + hard_encoding (nn.Linear): Outputs logits for hard (binary) attention. + q, k, v (nn.Linear): Layers for query, key, value projections for soft attention. + attn_score_layer (nn.Sequential): Computes unnormalized attention scores for each pair. + decode_1, decode_2 (nn.Linear): Decoding layers to produce the final attended embedding. + """ + def __init__(self, embedding_dim): + """ + Initialize attention mechanism for multi-agent communication. + + Args: + embedding_dim (int): Output embedding dimension per agent. + """ super(Attention, self).__init__() self.embedding_dim = embedding_dim @@ -41,15 +67,41 @@ def __init__(self, embedding_dim): nn.init.kaiming_uniform_(self.decode_2.weight, nonlinearity="leaky_relu") def encode_agent_features(self, embed): + """ + Encode agent features using a small MLP. + + Args: + embed (Tensor): Input features (B*N, 5). + + Returns: + Tensor: Encoded embedding (B*N, embedding_dim). + """ embed = F.leaky_relu(self.embedding1(embed)) embed = F.leaky_relu(self.embedding2(embed)) return embed def forward(self, embedding): + """ + Forward pass: computes both hard and soft attentions among agents, + produces the attended embedding for each agent, as well as diagnostic info. + + Args: + embedding (Tensor): Input tensor of shape (B, N, D), where D is at least 11. + + Returns: + tuple: + att_embedding (Tensor): Final attended embedding, shape (B*N, 2*embedding_dim). + hard_logits (Tensor): Logits for hard attention, (B*N, N-1). + unnorm_rel_dist (Tensor): Pairwise distances between agents (not normalized), (B*N, N-1, 1). + mean_entropy (Tensor): Mean entropy of soft attention distributions. + hard_weights (Tensor): Binary hard attention mask, (B, N, N-1). + comb_w (Tensor): Final combined attention weights, (N, N*(N-1)). 
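For reference, a hypothetical shape check against the `Attention` interface documented above (the 11-dimensional per-agent state matches `state_dim` in `train_marl.py`; the embedding size and tensor values are placeholders):

```python
import torch
from robot_nav.models.MARL.hardsoftAttention import Attention

B, N, D = 4, 3, 11                   # batch, agents, per-agent state length
attn = Attention(embedding_dim=128)  # embedding size is an arbitrary choice

obs = torch.randn(B, N, D)
(att_embedding, hard_logits, unnorm_rel_dist,
 mean_entropy, hard_weights, comb_w) = attn(obs)

print(att_embedding.shape)  # expected: (B*N, 2*embedding_dim)
print(hard_logits.shape)    # expected: (B*N, N-1)
print(hard_weights.shape)   # expected: (B, N, N-1)
```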
+ """ if embedding.dim() == 2: embedding = embedding.unsqueeze(0) batch_size, n_agents, _ = embedding.shape + # Extract sub-features embed = embedding[:, :, 4:9].reshape(batch_size * n_agents, -1) position = embedding[:, :, :2].reshape(batch_size, n_agents, 2) heading = embedding[:, :, 2:4].reshape( @@ -57,14 +109,17 @@ def forward(self, embedding): ) # assume (cos(θ), sin(θ)) action = embedding[:, :, 7:9].reshape(batch_size, n_agents, 2) goal = embedding[:, :, -2:].reshape(batch_size, n_agents, 2) + + # Compute pairwise relative goal vectors (for each i,j) goal_j = goal.unsqueeze(1).expand(-1, n_agents, -1, -1) pos_i = position.unsqueeze(2) goal_rel_vec = goal_j - pos_i + # Encode agent features agent_embed = self.encode_agent_features(embed) agent_embed = agent_embed.view(batch_size, n_agents, self.embedding_dim) - # For hard attention + # Prep for hard attention: compute all relative geometry for each agent pair h_i = agent_embed.unsqueeze(2) # (B, N, 1, D) pos_i = position.unsqueeze(2) # (B, N, 1, 2) pos_j = position.unsqueeze(1) # (B, 1, N, 2) @@ -88,7 +143,7 @@ def forward(self, embedding): heading_j_cos = heading_j[..., 0] # (B, 1, N) heading_j_sin = heading_j[..., 1] # (B, 1, N) - # Stack edge features + # Edge features for hard attention edge_features = torch.cat( [ rel_dist, # (B, N, N, 1) @@ -101,7 +156,7 @@ def forward(self, embedding): dim=-1, ) - # Broadcast h_i along N (for each pair) + # Broadcast agent embedding for all pairs (except self-pairs) h_i_expanded = h_i.expand(-1, -1, n_agents, -1) # Remove self-pairs using mask @@ -129,13 +184,14 @@ def forward(self, embedding): batch_size * n_agents, n_agents - 1, 1 ) - # Soft attention + # ---- Soft attention computation ---- q = self.q(agent_embed) attention_outputs = [] entropy_list = [] combined_w = [] + # Goal-relative polar features for soft attention goal_rel_dist = torch.linalg.vector_norm(goal_rel_vec, dim=-1, keepdim=True) goal_angle_global = torch.atan2(goal_rel_vec[..., 1], goal_rel_vec[..., 0]) heading_angle = torch.atan2(heading_i[..., 1], heading_i[..., 0]) @@ -147,6 +203,7 @@ def forward(self, embedding): [goal_rel_dist, goal_rel_angle_cos, goal_rel_angle_sin], dim=-1 ) + # Soft attention edge features (include goal polar) soft_edge_features = torch.cat([edge_features, goal_polar], dim=-1) for i in range(n_agents): q_i = q[:, i : i + 1, :] @@ -159,10 +216,10 @@ def forward(self, embedding): q_i_expanded = q_i.expand(-1, n_agents - 1, -1) attention_input = torch.cat([q_i_expanded, k], dim=-1) - # Score computation + # Score computation (per pair) scores = self.attn_score_layer(attention_input).transpose(1, 2) - # Mask using hard weights + # Mask using hard attention h_weights = hard_weights[:, i].unsqueeze(1) mask = (h_weights > 0.5).float() @@ -183,12 +240,12 @@ def forward(self, embedding): combined_weights = soft_weights * mask # (B, 1, N-1) combined_w.append(combined_weights) - # Normalize combined_weights for entropy calculation + # Normalize for entropy calculation combined_weights_norm = combined_weights / ( combined_weights.sum(dim=-1, keepdim=True) + epsilon ) - # Calculate entropy from combined_weights + # Entropy for analysis/logging entropy = ( -(combined_weights_norm * (combined_weights_norm + epsilon).log()) .sum(dim=-1) diff --git a/robot_nav/models/MARL/marlTD3.py b/robot_nav/models/MARL/marlTD3.py index 9fd4508..09b3122 100644 --- a/robot_nav/models/MARL/marlTD3.py +++ b/robot_nav/models/MARL/marlTD3.py @@ -11,6 +11,18 @@ class Actor(nn.Module): + """ + Policy network for MARL, with an attention 
mechanism for multi-robot coordination. + + Args: + action_dim (int): Number of action dimensions. + embedding_dim (int): Dimensionality of agent feature embeddings. + + Attributes: + attention (Attention): Encodes agent state and computes attention. + policy_head (nn.Sequential): MLP for mapping attention output to actions. + """ + def __init__(self, action_dim, embedding_dim): super().__init__() self.attention = Attention(embedding_dim) # ➊ edge classifier @@ -26,6 +38,16 @@ def __init__(self, action_dim, embedding_dim): ) def forward(self, obs, detach_attn=False): + """ + Forward pass through the actor. + + Args: + obs (Tensor): Observation input of shape (batch, n_agents, obs_dim). + detach_attn (bool, optional): If True, detach attention output from computation graph. + + Returns: + tuple: (action, hard_logits, pair_d, mean_entropy, hard_weights, combined_weights) + """ attn_out, hard_logits, pair_d, mean_entropy, hard_weights, combined_weights = ( self.attention(obs) ) @@ -36,6 +58,18 @@ def forward(self, obs, detach_attn=False): class Critic(nn.Module): + """ + Critic (value) network for MARL, with twin Q-outputs and attention encoding. + + Args: + action_dim (int): Number of action dimensions. + embedding_dim (int): Dimensionality of agent feature embeddings. + + Attributes: + attention (Attention): Encodes agent state and computes attention. + (Other attributes are MLP layers for twin Q-networks.) + """ + def __init__(self, action_dim, embedding_dim): super(Critic, self).__init__() self.embedding_dim = embedding_dim @@ -68,6 +102,21 @@ def __init__(self, action_dim, embedding_dim): torch.nn.init.kaiming_uniform_(self.layer_6.weight, nonlinearity="leaky_relu") def forward(self, embedding, action): + """ + Forward pass through both Q-networks using attention on agent embeddings. + + Args: + embedding (Tensor): Input agent embeddings (batch, n_agents, state_dim). + action (Tensor): Actions (batch * n_agents, action_dim). + + Returns: + tuple: (Q1, Q2, mean_entropy, hard_logits, unnorm_rel_dist, hard_weights) + Q1, Q2 (Tensor): Twin Q-value estimates (batch * n_agents, 1) + mean_entropy (Tensor): Soft attention entropy (scalar). + hard_logits (Tensor): Hard attention logits (batch * n_agents, n_agents-1). + unnorm_rel_dist (Tensor): Unnormalized inter-agent distances. + hard_weights (Tensor): Hard attention weights (batch, n_agents, n_agents-1). + """ ( embedding_with_attention, @@ -93,27 +142,24 @@ def forward(self, embedding, action): class TD3(object): """ - CNNTD3 (Twin Delayed Deep Deterministic Policy Gradient with CNN-based inputs) agent for - continuous control tasks. + TD3 (Twin Delayed Deep Deterministic Policy Gradient) agent for multi-agent reinforcement learning. - This class encapsulates the full implementation of the TD3 algorithm using neural network - architectures for the actor and critic, with optional bounding for critic outputs to - regularize learning. The agent is designed to train in environments where sensor - observations (e.g., LiDAR) are used for navigation tasks. + Wraps actor and critic networks, optimizer setup, exploration, training, and saving/loading utilities. Args: - state_dim (int): Dimension of the input state. - action_dim (int): Dimension of the output action. - max_action (float): Maximum magnitude of the action. - device (torch.device): Torch device to use (CPU or GPU). - lr (float): Learning rate for both actor and critic optimizers. - save_every (int): Save model every N training iterations (0 to disable). 
- load_model (bool): Whether to load a pre-trained model at initialization. - save_directory (Path): Path to the directory for saving model checkpoints. - model_name (str): Base name for the saved model files. - load_directory (Path): Path to load model checkpoints from (if `load_model=True`). - use_max_bound (bool): Whether to apply maximum Q-value bounding during training. - bound_weight (float): Weight for the bounding loss term in total loss. + state_dim (int): State vector length per agent. + action_dim (int): Number of action dimensions. + max_action (float): Maximum action value for clipping. + device (torch.device): Torch device. + num_robots (int): Number of robots/agents. + lr_actor (float): Learning rate for actor optimizer. + lr_critic (float): Learning rate for critic optimizer. + save_every (int): Save model every N train iterations (0 = disable). + load_model (bool): If True, load model from checkpoint. + save_directory (Path): Path for saving model files. + model_name (str): Base name for saved models. + load_model_name (str or None): Name for loading saved model files. + load_directory (Path): Path for loading model files. """ def __init__( @@ -171,14 +217,17 @@ def __init__( def get_action(self, obs, add_noise): """ - Selects an action for a given observation. + Computes an action (with optional exploration noise) for a given observation. Args: - obs (np.ndarray): The current observation/state. - add_noise (bool): Whether to add exploration noise to the action. + obs (np.ndarray): State vector (n_agents, state_dim) or batch. + add_noise (bool): Whether to add exploration noise. Returns: - (np.ndarray): The selected action. + tuple: (action, connection_logits, combined_weights) + action (np.ndarray): Action(s) (n_agents, action_dim). + connection_logits (Tensor): Hard attention logits. + combined_weights (Tensor): Final soft attention weights. """ action, connection, combined_weights = self.act(obs) if add_noise: @@ -193,10 +242,13 @@ def act(self, state): Computes the deterministic action from the actor network for a given state. Args: - state (np.ndarray): Input state. + state (np.ndarray): State (n_agents, state_dim). Returns: - (np.ndarray): Action predicted by the actor network. + tuple: (action, connection_logits, combined_weights) + action (np.ndarray): Action(s) (flattened). + connection_logits (Tensor): Hard attention logits. + combined_weights (Tensor): Final soft attention weights. """ # Function to get the action from the actor state = torch.Tensor(state).to(self.device) @@ -219,6 +271,25 @@ def train( entropy_weight=1, connection_proximity_threshold=4, ): + """ + Runs a full TD3 training cycle using sampled experiences. + + Args: + replay_buffer: Experience replay buffer. + iterations (int): Training steps. + batch_size (int): Batch size. + discount (float): Discount factor (gamma). + tau (float): Target network soft update factor. + policy_noise (float): Noise std for policy smoothing. + noise_clip (float): Max policy smoothing noise. + policy_freq (int): Frequency of actor/policy updates. + bce_weight (float): Loss weight for connection prediction BCE. + entropy_weight (float): Loss weight for attention entropy term. + connection_proximity_threshold (float): Threshold for true binary connection label. + + Returns: + None + """ av_Q = 0 max_Q = -inf av_loss = 0 @@ -450,18 +521,21 @@ def prepare_state( self, poses, distance, cos, sin, collision, action, goal_positions ): """ - Prepares the environment's raw agent state for learning. 
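A hedged construction/inference sketch for the MARL `TD3` agent described above; the sizes and hyperparameters are placeholders, and the remaining constructor arguments (learning rates, save/load directories, model names) are assumed to keep their defaults:

```python
import numpy as np
import torch
from robot_nav.models.MARL.marlTD3 import TD3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_robots, state_dim, action_dim = 3, 11, 2  # placeholder sizes

model = TD3(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=1.0,
    device=device,
    num_robots=num_robots,
    save_every=0,       # disable periodic checkpointing
    load_model=False,
)

# One batched inference pass: noisy action plus attention diagnostics.
obs = np.zeros((num_robots, state_dim), dtype=np.float32)
action, connection_logits, combined_weights = model.get_action(obs, add_noise=True)
```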
+ Formats raw environment state for learning. Args: poses (list): Each agent's global pose [x, y, theta]. - distance, cos, sin: Unused, can be removed or ignored. + distance (list): Distance to goal for each agent. + cos (list): Cosine of angle to goal. + sin (list): Sine of angle to goal. collision (list): Collision flags per agent. action (list): Last action taken [lin_vel, ang_vel]. goal_positions (list): Each agent's goal [x, y]. Returns: - states (list): List of processed state vectors. - terminal (list): Flags (1 if collision or goal reached, else 0). + tuple: + states (list): List of processed state vectors. + terminal (list): 1 if collision or goal reached, else 0. """ states = [] terminal = [] diff --git a/robot_nav/train_marl.py b/robot_nav/train_marl.py index 52e4dfa..39f8f33 100644 --- a/robot_nav/train_marl.py +++ b/robot_nav/train_marl.py @@ -7,6 +7,15 @@ def outside_of_bounds(poses): + """ + Check if any robot is outside the defined world boundaries. + + Args: + poses (list): List of [x, y, theta] poses for each robot. + + Returns: + bool: True if any robot is outside the 21x21 area centered at (6, 6), else False. + """ outside = False for pose in poses: norm_x = pose[0] - 6 @@ -19,6 +28,8 @@ def outside_of_bounds(poses): def main(args=None): """Main training function""" + + # ---- Hyperparameters and setup ---- action_dim = 2 # number of actions produced by the model max_action = 1 # maximum absolute value of output actions state_dim = 11 # number of input values in the neural network (vector length of state input) @@ -40,6 +51,7 @@ def main(args=None): ) save_every = 5 # save the model every n training cycles + # ---- Instantiate simulation environment and model ---- sim = MARL_SIM( world_file="multi_robot_world.yaml", disable_plotting=False ) # instantiate environment @@ -56,6 +68,7 @@ def main(args=None): load_model_name="phase1", ) # instantiate a model + # ---- Setup replay buffer and initial connections ---- replay_buffer = get_buffer( model, sim, @@ -69,12 +82,15 @@ def main(args=None): [[0.0 for _ in range(sim.num_robots - 1)] for _ in range(sim.num_robots)] ) + # ---- Take initial step in environment ---- poses, distance, cos, sin, collision, goal, a, reward, positions, goal_positions = ( sim.step([[0, 0] for _ in range(sim.num_robots)], connections) ) # get the initial step state running_goals = 0 running_collisions = 0 running_timesteps = 0 + + # ---- Main training loop ---- while epoch < max_epochs: # train until max_epochs is reached state, terminal = model.prepare_state( poses, distance, cos, sin, collision, a, goal_positions
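Putting the pieces together, a sketch of one interaction-and-update cycle following the pattern in `train_marl.py` (here `sim`, `model`, `replay_buffer`, and the previous step outputs are assumed to exist as set up above; action scaling, buffer insertion, and epoch bookkeeping are omitted, and the `train()` values are placeholders):

```python
import numpy as np

# Format the latest simulator outputs for the networks.
state, terminal = model.prepare_state(
    poses, distance, cos, sin, collision, a, goal_positions
)

# Query the policy with exploration noise; reuse the attention outputs
# as the connection information passed back to the simulator.
action, connections, combined_weights = model.get_action(np.array(state), add_noise=True)

# Advance the simulation for all robots.
(poses, distance, cos, sin, collision, goal,
 a, reward, positions, goal_positions) = sim.step(action, connections, combined_weights)

# Periodically run TD3 updates from the replay buffer (placeholder sizes;
# the remaining train() arguments are assumed to keep their defaults).
model.train(replay_buffer, iterations=80, batch_size=64)
```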