diff --git a/docs/api/IR-SIM/ir-marl-sim.md b/docs/api/IR-SIM/ir-marl-sim.md new file mode 100644 index 0000000..c576a23 --- /dev/null +++ b/docs/api/IR-SIM/ir-marl-sim.md @@ -0,0 +1,6 @@ +# MARL-IR-SIM + +::: robot_nav.SIM_ENV.marl_sim + options: + show_root_heading: true + show_source: true \ No newline at end of file diff --git a/docs/api/IR-SIM/ir-sim.md b/docs/api/IR-SIM/ir-sim.md index 1807be7..e236b00 100644 --- a/docs/api/IR-SIM/ir-sim.md +++ b/docs/api/IR-SIM/ir-sim.md @@ -1,6 +1,6 @@ # IR-SIM -::: robot_nav.sim +::: robot_nav.SIM_ENV.sim options: show_root_heading: true show_source: true \ No newline at end of file diff --git a/docs/api/models/MARL/Attention.md b/docs/api/models/MARL/Attention.md new file mode 100644 index 0000000..361fa59 --- /dev/null +++ b/docs/api/models/MARL/Attention.md @@ -0,0 +1,6 @@ +# Hard-Soft Attention + +::: robot_nav.models.MARL.hardsoftAttention + options: + show_root_heading: true + show_source: true diff --git a/docs/api/models/MARL/TD3.md b/docs/api/models/MARL/TD3.md new file mode 100644 index 0000000..ac99d61 --- /dev/null +++ b/docs/api/models/MARL/TD3.md @@ -0,0 +1,6 @@ +# MARL TD3 + +::: robot_nav.models.MARL.marlTD3 + options: + show_root_heading: true + show_source: true diff --git a/mkdocs.yml b/mkdocs.yml index fa58dd0..6eccc3c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,7 +4,9 @@ extra: nav: - Home: index.md - API Reference: - IR-SIM: api/IR-SIM/ir-sim + IR-SIM: + - SIM: api/IR-SIM/ir-sim + - MARL SIM: api/IR-SIM/ir-marl-sim Models: - DDPG: api/models/DDPG.md - TD3: api/models/TD3.md @@ -13,6 +15,9 @@ nav: - HCM: api/models/HCM.md - PPO: api/models/PPO.md - SAC: api/models/SAC.md + - MARL: + - HardSoft Attention: api/models/MARL/Attention + - TD3: api/models/MARL/TD3 Training: - Train: api/Training/train.md - Train RNN: api/Training/trainrnn.md diff --git a/robot_nav/SIM_ENV/marl_sim.py b/robot_nav/SIM_ENV/marl_sim.py index 8fd10ba..4016786 100644 --- a/robot_nav/SIM_ENV/marl_sim.py +++ b/robot_nav/SIM_ENV/marl_sim.py @@ -8,23 +8,26 @@ class MARL_SIM(SIM_ENV): """ - A simulation environment interface for robot navigation using IRSim in MARL setting. + Simulation environment for multi-agent robot navigation using IRSim. - This class wraps around the IRSim environment and provides methods for stepping, - resetting, and interacting with mobile robots, including reward computation. + This class extends the SIM_ENV and provides a wrapper for multi-robot + simulation and interaction, supporting reward computation and custom reset logic. Attributes: - env (object): The simulation environment instance from IRSim. - robot_goal (np.ndarray): The goal position of the robot. + env (object): IRSim simulation environment instance. + robot_goal (np.ndarray): Current goal position(s) for the robots. + num_robots (int): Number of robots in the environment. + x_range (tuple): World x-range. + y_range (tuple): World y-range. """ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False): """ - Initialize the simulation environment. + Initialize the MARL_SIM environment. Args: - world_file (str): Path to the world configuration YAML file. - disable_plotting (bool): If True, disables rendering and plotting. + world_file (str, optional): Path to the world configuration YAML file. + disable_plotting (bool, optional): If True, disables IRSim rendering and plotting. 
""" display = False if disable_plotting else True self.env = irsim.make( @@ -38,15 +41,26 @@ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False): def step(self, action, connection, combined_weights=None): """ - Perform one step in the simulation using the given control commands. + Perform a simulation step for all robots using the provided actions and connections. Args: - lin_velocity (float): Linear velocity to apply to the robot. - ang_velocity (float): Angular velocity to apply to the robot. + action (list): List of actions for each robot [[lin_vel, ang_vel], ...]. + connection (Tensor): Tensor of shape (num_robots, num_robots-1) containing logits indicating connections between robots. + combined_weights (Tensor or None, optional): Optional weights for each connection, shape (num_robots, num_robots-1). Returns: - (tuple): Contains the latest LIDAR scan, distance to goal, cosine and sine of angle to goal, - collision flag, goal reached flag, applied action, and computed reward. + tuple: ( + poses (list): List of [x, y, theta] for each robot, + distances (list): Distance to goal for each robot, + coss (list): Cosine of angle to goal for each robot, + sins (list): Sine of angle to goal for each robot, + collisions (list): Collision status for each robot, + goals (list): Goal reached status for each robot, + action (list): Actions applied, + rewards (list): Rewards computed, + positions (list): Current [x, y] for each robot, + goal_positions (list): Goal [x, y] for each robot, + ) """ self.env.step(action_id=[i for i in range(self.num_robots)], action=action) self.env.render() @@ -166,17 +180,27 @@ def reset( random_obstacle_ids=None, ): """ - Reset the simulation environment, optionally setting robot and obstacle states. + Reset the simulation environment and optionally set robot and obstacle positions. Args: - robot_state (list or None): Initial state of the robot as a list of [x, y, theta, speed]. - robot_goal (list or None): Goal state for the robot. - random_obstacles (bool): Whether to randomly reposition obstacles. - random_obstacle_ids (list or None): Specific obstacle IDs to randomize. + robot_state (list or None, optional): Initial state for robots as [x, y, theta, speed]. + robot_goal (list or None, optional): Goal position(s) for the robots. + random_obstacles (bool, optional): If True, randomly position obstacles. + random_obstacle_ids (list or None, optional): IDs of obstacles to randomize. Returns: - (tuple): Initial observation after reset, including LIDAR scan, distance, cos/sin, - and reward-related flags and values. + tuple: ( + poses (list): List of [x, y, theta] for each robot, + distances (list): Distance to goal for each robot, + coss (list): Cosine of angle to goal for each robot, + sins (list): Sine of angle to goal for each robot, + collisions (list): All False after reset, + goals (list): All False after reset, + action (list): Initial action ([[0.0, 0.0], ...]), + rewards (list): Rewards for initial state, + positions (list): Initial [x, y] for each robot, + goal_positions (list): Initial goal [x, y] for each robot, + ) """ if robot_state is None: robot_state = [[random.uniform(3, 9)], [random.uniform(3, 9)], [0]] @@ -254,18 +278,18 @@ def reset( @staticmethod def get_reward(goal, collision, action, closest_robots, distance, phase=1): """ - Calculate the reward for the current step. + Calculate the reward for a robot given the current state and action. Args: - goal (bool): Whether the goal has been reached. 
+ goal (bool): Whether the robot reached its goal. collision (bool): Whether a collision occurred. - action (list): The action taken [linear velocity, angular velocity]. - closest_robots (list): Distances to the closest robots. - distance (float): Distance to goal. - phase (int, optional): Reward function phase. Defaults to 1. + action (list): [linear_velocity, angular_velocity] applied. + closest_robots (list): Distances to the closest other robots. + distance (float): Distance to the goal. + phase (int, optional): Reward phase/function selector (default: 1). Returns: - (float): Computed reward for the current state. + float: Computed reward. """ match phase: diff --git a/robot_nav/models/MARL/__init__.py b/robot_nav/models/MARL/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/robot_nav/models/MARL/hardsoftAttention.py b/robot_nav/models/MARL/hardsoftAttention.py index 4978a9f..75b8b6c 100644 --- a/robot_nav/models/MARL/hardsoftAttention.py +++ b/robot_nav/models/MARL/hardsoftAttention.py @@ -5,7 +5,33 @@ class Attention(nn.Module): + """ + Multi-robot attention mechanism for learning hard and soft attentions. + + This module provides both hard (binary) and soft (weighted) attention, + combining feature encoding, relative pose and goal geometry, and + message passing between agents. + + Args: + embedding_dim (int): Dimension of the agent embedding vector. + + Attributes: + embedding1 (nn.Linear): First layer for agent feature encoding. + embedding2 (nn.Linear): Second layer for agent feature encoding. + hard_mlp (nn.Sequential): MLP to process concatenated agent and edge features. + hard_encoding (nn.Linear): Outputs logits for hard (binary) attention. + q, k, v (nn.Linear): Layers for query, key, value projections for soft attention. + attn_score_layer (nn.Sequential): Computes unnormalized attention scores for each pair. + decode_1, decode_2 (nn.Linear): Decoding layers to produce the final attended embedding. + """ + def __init__(self, embedding_dim): + """ + Initialize attention mechanism for multi-agent communication. + + Args: + embedding_dim (int): Output embedding dimension per agent. + """ super(Attention, self).__init__() self.embedding_dim = embedding_dim @@ -41,15 +67,41 @@ def __init__(self, embedding_dim): nn.init.kaiming_uniform_(self.decode_2.weight, nonlinearity="leaky_relu") def encode_agent_features(self, embed): + """ + Encode agent features using a small MLP. + + Args: + embed (Tensor): Input features (B*N, 5). + + Returns: + Tensor: Encoded embedding (B*N, embedding_dim). + """ embed = F.leaky_relu(self.embedding1(embed)) embed = F.leaky_relu(self.embedding2(embed)) return embed def forward(self, embedding): + """ + Forward pass: computes both hard and soft attentions among agents, + produces the attended embedding for each agent, as well as diagnostic info. + + Args: + embedding (Tensor): Input tensor of shape (B, N, D), where D is at least 11. + + Returns: + tuple: + att_embedding (Tensor): Final attended embedding, shape (B*N, 2*embedding_dim). + hard_logits (Tensor): Logits for hard attention, (B*N, N-1). + unnorm_rel_dist (Tensor): Pairwise distances between agents (not normalized), (B*N, N-1, 1). + mean_entropy (Tensor): Mean entropy of soft attention distributions. + hard_weights (Tensor): Binary hard attention mask, (B, N, N-1). + comb_w (Tensor): Final combined attention weights, (N, N*(N-1)). 
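For reference, a hypothetical shape check against the `Attention` interface documented above (the 11-dimensional per-agent state matches `state_dim` in `train_marl.py`; the embedding size and tensor values are placeholders):

```python
import torch
from robot_nav.models.MARL.hardsoftAttention import Attention

B, N, D = 4, 3, 11                   # batch, agents, per-agent state length
attn = Attention(embedding_dim=128)  # embedding size is an arbitrary choice

obs = torch.randn(B, N, D)
(att_embedding, hard_logits, unnorm_rel_dist,
 mean_entropy, hard_weights, comb_w) = attn(obs)

print(att_embedding.shape)  # expected: (B*N, 2*embedding_dim)
print(hard_logits.shape)    # expected: (B*N, N-1)
print(hard_weights.shape)   # expected: (B, N, N-1)
```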
+ """ if embedding.dim() == 2: embedding = embedding.unsqueeze(0) batch_size, n_agents, _ = embedding.shape + # Extract sub-features embed = embedding[:, :, 4:9].reshape(batch_size * n_agents, -1) position = embedding[:, :, :2].reshape(batch_size, n_agents, 2) heading = embedding[:, :, 2:4].reshape( @@ -57,14 +109,17 @@ def forward(self, embedding): ) # assume (cos(θ), sin(θ)) action = embedding[:, :, 7:9].reshape(batch_size, n_agents, 2) goal = embedding[:, :, -2:].reshape(batch_size, n_agents, 2) + + # Compute pairwise relative goal vectors (for each i,j) goal_j = goal.unsqueeze(1).expand(-1, n_agents, -1, -1) pos_i = position.unsqueeze(2) goal_rel_vec = goal_j - pos_i + # Encode agent features agent_embed = self.encode_agent_features(embed) agent_embed = agent_embed.view(batch_size, n_agents, self.embedding_dim) - # For hard attention + # Prep for hard attention: compute all relative geometry for each agent pair h_i = agent_embed.unsqueeze(2) # (B, N, 1, D) pos_i = position.unsqueeze(2) # (B, N, 1, 2) pos_j = position.unsqueeze(1) # (B, 1, N, 2) @@ -88,7 +143,7 @@ def forward(self, embedding): heading_j_cos = heading_j[..., 0] # (B, 1, N) heading_j_sin = heading_j[..., 1] # (B, 1, N) - # Stack edge features + # Edge features for hard attention edge_features = torch.cat( [ rel_dist, # (B, N, N, 1) @@ -101,7 +156,7 @@ def forward(self, embedding): dim=-1, ) - # Broadcast h_i along N (for each pair) + # Broadcast agent embedding for all pairs (except self-pairs) h_i_expanded = h_i.expand(-1, -1, n_agents, -1) # Remove self-pairs using mask @@ -129,13 +184,14 @@ def forward(self, embedding): batch_size * n_agents, n_agents - 1, 1 ) - # Soft attention + # ---- Soft attention computation ---- q = self.q(agent_embed) attention_outputs = [] entropy_list = [] combined_w = [] + # Goal-relative polar features for soft attention goal_rel_dist = torch.linalg.vector_norm(goal_rel_vec, dim=-1, keepdim=True) goal_angle_global = torch.atan2(goal_rel_vec[..., 1], goal_rel_vec[..., 0]) heading_angle = torch.atan2(heading_i[..., 1], heading_i[..., 0]) @@ -147,6 +203,7 @@ def forward(self, embedding): [goal_rel_dist, goal_rel_angle_cos, goal_rel_angle_sin], dim=-1 ) + # Soft attention edge features (include goal polar) soft_edge_features = torch.cat([edge_features, goal_polar], dim=-1) for i in range(n_agents): q_i = q[:, i : i + 1, :] @@ -159,10 +216,10 @@ def forward(self, embedding): q_i_expanded = q_i.expand(-1, n_agents - 1, -1) attention_input = torch.cat([q_i_expanded, k], dim=-1) - # Score computation + # Score computation (per pair) scores = self.attn_score_layer(attention_input).transpose(1, 2) - # Mask using hard weights + # Mask using hard attention h_weights = hard_weights[:, i].unsqueeze(1) mask = (h_weights > 0.5).float() @@ -183,12 +240,12 @@ def forward(self, embedding): combined_weights = soft_weights * mask # (B, 1, N-1) combined_w.append(combined_weights) - # Normalize combined_weights for entropy calculation + # Normalize for entropy calculation combined_weights_norm = combined_weights / ( combined_weights.sum(dim=-1, keepdim=True) + epsilon ) - # Calculate entropy from combined_weights + # Entropy for analysis/logging entropy = ( -(combined_weights_norm * (combined_weights_norm + epsilon).log()) .sum(dim=-1) diff --git a/robot_nav/models/MARL/marlTD3.py b/robot_nav/models/MARL/marlTD3.py index 9fd4508..09b3122 100644 --- a/robot_nav/models/MARL/marlTD3.py +++ b/robot_nav/models/MARL/marlTD3.py @@ -11,6 +11,18 @@ class Actor(nn.Module): + """ + Policy network for MARL, with an attention 
mechanism for multi-robot coordination. + + Args: + action_dim (int): Number of action dimensions. + embedding_dim (int): Dimensionality of agent feature embeddings. + + Attributes: + attention (Attention): Encodes agent state and computes attention. + policy_head (nn.Sequential): MLP for mapping attention output to actions. + """ + def __init__(self, action_dim, embedding_dim): super().__init__() self.attention = Attention(embedding_dim) # ➊ edge classifier @@ -26,6 +38,16 @@ def __init__(self, action_dim, embedding_dim): ) def forward(self, obs, detach_attn=False): + """ + Forward pass through the actor. + + Args: + obs (Tensor): Observation input of shape (batch, n_agents, obs_dim). + detach_attn (bool, optional): If True, detach attention output from computation graph. + + Returns: + tuple: (action, hard_logits, pair_d, mean_entropy, hard_weights, combined_weights) + """ attn_out, hard_logits, pair_d, mean_entropy, hard_weights, combined_weights = ( self.attention(obs) ) @@ -36,6 +58,18 @@ def forward(self, obs, detach_attn=False): class Critic(nn.Module): + """ + Critic (value) network for MARL, with twin Q-outputs and attention encoding. + + Args: + action_dim (int): Number of action dimensions. + embedding_dim (int): Dimensionality of agent feature embeddings. + + Attributes: + attention (Attention): Encodes agent state and computes attention. + (Other attributes are MLP layers for twin Q-networks.) + """ + def __init__(self, action_dim, embedding_dim): super(Critic, self).__init__() self.embedding_dim = embedding_dim @@ -68,6 +102,21 @@ def __init__(self, action_dim, embedding_dim): torch.nn.init.kaiming_uniform_(self.layer_6.weight, nonlinearity="leaky_relu") def forward(self, embedding, action): + """ + Forward pass through both Q-networks using attention on agent embeddings. + + Args: + embedding (Tensor): Input agent embeddings (batch, n_agents, state_dim). + action (Tensor): Actions (batch * n_agents, action_dim). + + Returns: + tuple: (Q1, Q2, mean_entropy, hard_logits, unnorm_rel_dist, hard_weights) + Q1, Q2 (Tensor): Twin Q-value estimates (batch * n_agents, 1) + mean_entropy (Tensor): Soft attention entropy (scalar). + hard_logits (Tensor): Hard attention logits (batch * n_agents, n_agents-1). + unnorm_rel_dist (Tensor): Unnormalized inter-agent distances. + hard_weights (Tensor): Hard attention weights (batch, n_agents, n_agents-1). + """ ( embedding_with_attention, @@ -93,27 +142,24 @@ def forward(self, embedding, action): class TD3(object): """ - CNNTD3 (Twin Delayed Deep Deterministic Policy Gradient with CNN-based inputs) agent for - continuous control tasks. + TD3 (Twin Delayed Deep Deterministic Policy Gradient) agent for multi-agent reinforcement learning. - This class encapsulates the full implementation of the TD3 algorithm using neural network - architectures for the actor and critic, with optional bounding for critic outputs to - regularize learning. The agent is designed to train in environments where sensor - observations (e.g., LiDAR) are used for navigation tasks. + Wraps actor and critic networks, optimizer setup, exploration, training, and saving/loading utilities. Args: - state_dim (int): Dimension of the input state. - action_dim (int): Dimension of the output action. - max_action (float): Maximum magnitude of the action. - device (torch.device): Torch device to use (CPU or GPU). - lr (float): Learning rate for both actor and critic optimizers. - save_every (int): Save model every N training iterations (0 to disable). 
- load_model (bool): Whether to load a pre-trained model at initialization. - save_directory (Path): Path to the directory for saving model checkpoints. - model_name (str): Base name for the saved model files. - load_directory (Path): Path to load model checkpoints from (if `load_model=True`). - use_max_bound (bool): Whether to apply maximum Q-value bounding during training. - bound_weight (float): Weight for the bounding loss term in total loss. + state_dim (int): State vector length per agent. + action_dim (int): Number of action dimensions. + max_action (float): Maximum action value for clipping. + device (torch.device): Torch device. + num_robots (int): Number of robots/agents. + lr_actor (float): Learning rate for actor optimizer. + lr_critic (float): Learning rate for critic optimizer. + save_every (int): Save model every N train iterations (0 = disable). + load_model (bool): If True, load model from checkpoint. + save_directory (Path): Path for saving model files. + model_name (str): Base name for saved models. + load_model_name (str or None): Name for loading saved model files. + load_directory (Path): Path for loading model files. """ def __init__( @@ -171,14 +217,17 @@ def __init__( def get_action(self, obs, add_noise): """ - Selects an action for a given observation. + Computes an action (with optional exploration noise) for a given observation. Args: - obs (np.ndarray): The current observation/state. - add_noise (bool): Whether to add exploration noise to the action. + obs (np.ndarray): State vector (n_agents, state_dim) or batch. + add_noise (bool): Whether to add exploration noise. Returns: - (np.ndarray): The selected action. + tuple: (action, connection_logits, combined_weights) + action (np.ndarray): Action(s) (n_agents, action_dim). + connection_logits (Tensor): Hard attention logits. + combined_weights (Tensor): Final soft attention weights. """ action, connection, combined_weights = self.act(obs) if add_noise: @@ -193,10 +242,13 @@ def act(self, state): Computes the deterministic action from the actor network for a given state. Args: - state (np.ndarray): Input state. + state (np.ndarray): State (n_agents, state_dim). Returns: - (np.ndarray): Action predicted by the actor network. + tuple: (action, connection_logits, combined_weights) + action (np.ndarray): Action(s) (flattened). + connection_logits (Tensor): Hard attention logits. + combined_weights (Tensor): Final soft attention weights. """ # Function to get the action from the actor state = torch.Tensor(state).to(self.device) @@ -219,6 +271,25 @@ def train( entropy_weight=1, connection_proximity_threshold=4, ): + """ + Runs a full TD3 training cycle using sampled experiences. + + Args: + replay_buffer: Experience replay buffer. + iterations (int): Training steps. + batch_size (int): Batch size. + discount (float): Discount factor (gamma). + tau (float): Target network soft update factor. + policy_noise (float): Noise std for policy smoothing. + noise_clip (float): Max policy smoothing noise. + policy_freq (int): Frequency of actor/policy updates. + bce_weight (float): Loss weight for connection prediction BCE. + entropy_weight (float): Loss weight for attention entropy term. + connection_proximity_threshold (float): Threshold for true binary connection label. + + Returns: + None + """ av_Q = 0 max_Q = -inf av_loss = 0 @@ -450,18 +521,21 @@ def prepare_state( self, poses, distance, cos, sin, collision, action, goal_positions ): """ - Prepares the environment's raw agent state for learning. 
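A hedged construction/inference sketch for the MARL `TD3` agent described above; the sizes and hyperparameters are placeholders, and the remaining constructor arguments (learning rates, save/load directories, model names) are assumed to keep their defaults:

```python
import numpy as np
import torch
from robot_nav.models.MARL.marlTD3 import TD3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_robots, state_dim, action_dim = 3, 11, 2  # placeholder sizes

model = TD3(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=1.0,
    device=device,
    num_robots=num_robots,
    save_every=0,       # disable periodic checkpointing
    load_model=False,
)

# One batched inference pass: noisy action plus attention diagnostics.
obs = np.zeros((num_robots, state_dim), dtype=np.float32)
action, connection_logits, combined_weights = model.get_action(obs, add_noise=True)
```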
+ Formats raw environment state for learning. Args: poses (list): Each agent's global pose [x, y, theta]. - distance, cos, sin: Unused, can be removed or ignored. + distance (list): Distance to goal for each agent. + cos (list): Cosine of angle to goal. + sin (list): Sine of angle to goal. collision (list): Collision flags per agent. action (list): Last action taken [lin_vel, ang_vel]. goal_positions (list): Each agent's goal [x, y]. Returns: - states (list): List of processed state vectors. - terminal (list): Flags (1 if collision or goal reached, else 0). + tuple: + states (list): List of processed state vectors. + terminal (list): 1 if collision or goal reached, else 0. """ states = [] terminal = [] diff --git a/robot_nav/train_marl.py b/robot_nav/train_marl.py index 52e4dfa..39f8f33 100644 --- a/robot_nav/train_marl.py +++ b/robot_nav/train_marl.py @@ -7,6 +7,15 @@ def outside_of_bounds(poses): + """ + Check if any robot is outside the defined world boundaries. + + Args: + poses (list): List of [x, y, theta] poses for each robot. + + Returns: + bool: True if any robot is outside the 21x21 area centered at (6, 6), else False. + """ outside = False for pose in poses: norm_x = pose[0] - 6 @@ -19,6 +28,8 @@ def outside_of_bounds(poses): def main(args=None): """Main training function""" + + # ---- Hyperparameters and setup ---- action_dim = 2 # number of actions produced by the model max_action = 1 # maximum absolute value of output actions state_dim = 11 # number of input values in the neural network (vector length of state input) @@ -40,6 +51,7 @@ def main(args=None): ) save_every = 5 # save the model every n training cycles + # ---- Instantiate simulation environment and model ---- sim = MARL_SIM( world_file="multi_robot_world.yaml", disable_plotting=False ) # instantiate environment @@ -56,6 +68,7 @@ def main(args=None): load_model_name="phase1", ) # instantiate a model + # ---- Setup replay buffer and initial connections ---- replay_buffer = get_buffer( model, sim, @@ -69,12 +82,15 @@ def main(args=None): [[0.0 for _ in range(sim.num_robots - 1)] for _ in range(sim.num_robots)] ) + # ---- Take initial step in environment ---- poses, distance, cos, sin, collision, goal, a, reward, positions, goal_positions = ( sim.step([[0, 0] for _ in range(sim.num_robots)], connections) ) # get the initial step state running_goals = 0 running_collisions = 0 running_timesteps = 0 + + # ---- Main training loop ---- while epoch < max_epochs: # train until max_epochs is reached state, terminal = model.prepare_state( poses, distance, cos, sin, collision, a, goal_positions
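Putting the pieces together, a sketch of one interaction-and-update cycle following the pattern in `train_marl.py` (here `sim`, `model`, `replay_buffer`, and the previous step outputs are assumed to exist as set up above; action scaling, buffer insertion, and epoch bookkeeping are omitted, and the `train()` values are placeholders):

```python
import numpy as np

# Format the latest simulator outputs for the networks.
state, terminal = model.prepare_state(
    poses, distance, cos, sin, collision, a, goal_positions
)

# Query the policy with exploration noise; reuse the attention outputs
# as the connection information passed back to the simulator.
action, connections, combined_weights = model.get_action(np.array(state), add_noise=True)

# Advance the simulation for all robots.
(poses, distance, cos, sin, collision, goal,
 a, reward, positions, goal_positions) = sim.step(action, connections, combined_weights)

# Periodically run TD3 updates from the replay buffer (placeholder sizes;
# the remaining train() arguments are assumed to keep their defaults).
model.train(replay_buffer, iterations=80, batch_size=64)
```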