class Actor(nn.Module):
+ """
+ Actor network for the TD3 algorithm.
+
+ This neural network maps states to actions using a feedforward architecture with
+ LeakyReLU activations and a final Tanh output to bound the actions in [-1, 1].
+
+ Architecture:
+     Input: state_dim
+     Hidden Layer 1: 400 units, LeakyReLU
+     Hidden Layer 2: 300 units, LeakyReLU
+     Output Layer: action_dim, Tanh
+
+ Args:
+     state_dim (int): Dimension of the input state.
+     action_dim (int): Dimension of the action output.
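+
+ Example (illustrative only; the dimensions below are placeholders, not values from this project):
+     >>> actor = Actor(state_dim=25, action_dim=2)
+     >>> action = actor(torch.randn(1, 25))  # shape (1, 2), bounded to [-1, 1] by Tanh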
+ """
+
def __init__(self, state_dim, action_dim):
super(Actor, self).__init__()
@@ -22,13 +39,39 @@ def __init__(self, state_dim, action_dim):
self.tanh = nn.Tanh()

def forward(self, s):
+ """
+ Perform a forward pass through the actor network.
+
+ Args:
+     s (torch.Tensor): Input state tensor.
+
+ Returns:
+     torch.Tensor: Action output tensor after Tanh activation.
+ """
s = F.leaky_relu(self.layer_1(s))
s = F.leaky_relu(self.layer_2(s))
a = self.tanh(self.layer_3(s))
return a


class Critic(nn.Module):
+ """
+ Critic network for the TD3 algorithm.
+
+ This class defines two Q-value estimators (Q1 and Q2) using separate subnetworks.
+ Each Q-network takes both state and action as input and outputs a scalar Q-value.
+
+ Architecture for each Q-network:
+     Input: state_dim and action_dim
+     - State pathway: Linear + LeakyReLU → 400 → 300
+     - Action pathway: Linear → 300
+     - Combined pathway: LeakyReLU(Linear(state) + Linear(action) + bias) → 1
+
+ Args:
+     state_dim (int): Dimension of the input state.
+     action_dim (int): Dimension of the input action.
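+
+ Example (illustrative only; batch size and dimensions are placeholders):
+     >>> critic = Critic(state_dim=25, action_dim=2)
+     >>> q1, q2 = critic(torch.randn(8, 25), torch.randn(8, 2))  # two Q-value tensors of shape (8, 1)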
+ """
+
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
@@ -51,6 +94,18 @@ def __init__(self, state_dim, action_dim):
torch.nn.init.kaiming_uniform_(self.layer_6.weight, nonlinearity="leaky_relu")

def forward(self, s, a):
+ """
+ Perform a forward pass through both Q-networks.
+
+ Args:
+     s (torch.Tensor): Input state tensor.
+     a (torch.Tensor): Input action tensor.
+
+ Returns:
+     tuple:
+         - q1 (torch.Tensor): Output Q-value from the first critic network.
+         - q2 (torch.Tensor): Output Q-value from the second critic network.
+ """
s1 = F.leaky_relu(self.layer_1(s))
self.layer_2_s(s1)
self.layer_2_a(a)
@@ -86,8 +141,28 @@ def __init__(
use_max_bound=False,
bound_weight=0.25,
):
- # Initialize the Actor network
+ """
+ Twin Delayed Deep Deterministic Policy Gradient (TD3) agent.
+
+ This class implements the TD3 reinforcement learning algorithm for continuous control.
+ It uses an Actor-Critic architecture with target networks and delayed policy updates.
+
+ Args:
+     state_dim (int): Dimension of the input state.
+     action_dim (int): Dimension of the action space.
+     max_action (float): Maximum allowed value for actions.
+     device (torch.device): Device to run the model on (CPU or CUDA).
+     lr (float, optional): Learning rate for both actor and critic. Default is 1e-4.
+     save_every (int, optional): Save model every `save_every` iterations. Default is 0.
+     load_model (bool, optional): Whether to load model from checkpoint. Default is False.
+     save_directory (Path, optional): Directory to save model checkpoints.
+     model_name (str, optional): Name to use when saving/loading models.
+     load_directory (Path, optional): Directory to load model checkpoints from.
+     use_max_bound (bool, optional): Whether to apply maximum Q-value bounding during training.
+     bound_weight (float, optional): Weight for the max-bound loss penalty.
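+
+ Example (illustrative only; assumes the enclosing class is named `TD3` and uses placeholder dimensions):
+     >>> agent = TD3(state_dim=25, action_dim=2, max_action=1.0,
+     ...             device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))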
+ """
self.device = device
+ # Initialize the Actor network
self.actor = Actor(state_dim, action_dim).to(self.device)
self.actor_target = Actor(state_dim, action_dim).to(self.device)
self.actor_target.load_state_dict(self.actor.state_dict())
@@ -113,6 +188,16 @@ def __init__(
self.bound_weight = bound_weight

def get_action(self, obs, add_noise):
+ """
+ Get an action from the current policy with optional exploration noise.
+
+ Args:
+     obs (np.ndarray): The current state observation.
+     add_noise (bool): Whether to add exploration noise.
+
+ Returns:
+     np.ndarray: The chosen action clipped to [-max_action, max_action].
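+
+ Example (illustrative only; `agent` and the observation size are placeholders):
+     >>> obs = np.zeros(25, dtype=np.float32)
+     >>> action = agent.get_action(obs, add_noise=True)  # noisy action for exploration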
+ """
if add_noise:
return (
self.act(obs) + np.random.normal(0, 0.2, size=self.action_dim)
@@ -121,7 +206,15 @@ def get_action(self, obs, add_noise):
return self.act(obs)

def act(self, state):
- # Function to get the action from the actor
+ """
+ Compute the action using the actor network without exploration noise.
+
+ Args:
+     state (np.ndarray): The current environment state.
+
+ Returns:
+     np.ndarray: The deterministic action predicted by the actor.
+ """
state = torch.Tensor(state).to(self.device)
return self.actor(state).cpu().data.numpy().flatten()
@@ -142,6 +235,24 @@ def train(
distance_norm=10,
time_step=0.3,
):
+ """
+ Train the TD3 agent using batches sampled from the replay buffer.
+
+ Args:
+     replay_buffer: The replay buffer to sample experiences from.
+     iterations (int): Number of training iterations to perform.
+     batch_size (int): Size of each mini-batch.
+     discount (float): Discount factor gamma for future rewards.
+     tau (float): Soft update rate for target networks.
+     policy_noise (float): Stddev of Gaussian noise added to target actions.
+     noise_clip (float): Maximum magnitude of noise added to target actions.
+     policy_freq (int): Frequency of policy (actor) updates.
+     max_lin_vel (float): Max linear velocity used for upper bound estimation.
+     max_ang_vel (float): Max angular velocity used for upper bound estimation.
+     goal_reward (float): Reward given for reaching the goal.
+     distance_norm (float): Distance normalization factor.
+     time_step (float): Time step used in upper bound calculations.
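+
+ Example (illustrative only; `agent` and `replay_buffer` are assumed to already exist):
+     >>> agent.train(replay_buffer, iterations=100, batch_size=64)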
+ """
av_Q = 0
max_Q = -inf
av_loss = 0
@@ -248,6 +359,13 @@ def train(
self.save(filename=self.model_name, directory=self.save_directory)

def save(self, filename, directory):
+ """
+ Save the actor and critic networks (and their targets) to disk.
+
+ Args:
+     filename (str): Name to use when saving model files.
+     directory (Path): Directory where models should be saved.
+ """
Path(directory).mkdir(parents=True, exist_ok=True)
torch.save(self.actor.state_dict(), "%s/%s_actor.pth" % (directory, filename))
torch.save(
@@ -261,6 +379,13 @@ def save(self, filename, directory):
)

def load(self, filename, directory):
+ """
+ Load the actor and critic networks (and their targets) from disk.
+
+ Args:
+     filename (str): Name used when saving the models.
+     directory (Path): Directory where models are saved.
+ """
self.actor.load_state_dict(
torch.load("%s/%s_actor.pth" % (directory, filename))
)
@@ -276,7 +401,26 @@ def load(self, filename, directory):
print(f"Loaded weights from: {directory}")

def prepare_state(self, latest_scan, distance, cos, sin, collision, goal, action):
- # update the returned data from ROS into a form used for learning in the current model
+ """
+ Prepare the input state vector for training or inference.
+
+ Combines processed laser scan data, the goal vector, and the past action
+ into a normalized state vector that matches the model's input dimension.
+
+ Args:
+     latest_scan (list or np.ndarray): Laser scan data.
+     distance (float): Distance to goal.
+     cos (float): Cosine of the heading angle to goal.
+     sin (float): Sine of the heading angle to goal.
+     collision (bool): Whether a collision occurred.
+     goal (bool): Whether the goal has been reached.
+     action (list or np.ndarray): Last executed action [linear_vel, angular_vel].
+
+ Returns:
+     tuple:
+         - state (list): Prepared and normalized state vector.
+         - terminal (int): 1 if episode should terminate (goal or collision), else 0.
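+
+ Example (illustrative only; scan length and values are placeholders):
+     >>> scan = [1.0] * 180
+     >>> state, terminal = agent.prepare_state(scan, distance=2.5, cos=1.0, sin=0.0,
+     ...                                       collision=False, goal=False, action=[0.0, 0.0])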
+ """
latest_scan = np.array(latest_scan)

inf_mask = np.isinf(latest_scan)