@@ -2406,12 +2406,19 @@ namespace dlib { namespace tt
         long feature_dim
     );
     /*!
-        requires
-            - halt_params.size() == feature_dim + 1 (weights + bias)
-            - input_data dimensions match batch_size x seq_len x ...
-        ensures
-            - halt_probs contains sigmoid(W_halt^T * input + b_halt) for each position
-            - logits contains the pre-sigmoid values
+        requires
+            - halt_params.size() == feature_dim + 1 (weights + bias)
+            - input_data.num_samples() == batch_size
+            - input_data.k() == num_channels where feature_dim = num_channels * d_model
+            - input_data.nr() == seq_len
+            - input_data.nc() == d_model
+        ensures
+            - Computes halting probabilities for Adaptive Computation Time:
+                - halt_probs contains sigmoid(W_halt^T * input + b_halt) for each position
+                - logits contains the pre-sigmoid values
+            - batch_size: number of samples in the batch
+            - seq_len: sequence length (number of positions to process)
+            - feature_dim: total feature dimension (num_channels × d_model)
     !*/
 
     void update_act_state (
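
The spec above amounts to a per-position linear projection followed by a sigmoid. As a reading aid, here is a minimal sketch in plain C++ rather than the dlib tensor API; the function name, the flat row-major [batch_size x seq_len x feature_dim] layout, and the weights-then-bias ordering of halt_params are assumptions for illustration only.

#include <cmath>
#include <vector>

// Illustrative sketch only: per-position halting probabilities,
// assuming a flat row-major layout and halt_params = feature_dim weights
// followed by a single bias term.
void compute_halting_probs_sketch (
    const std::vector<float>& input_data,   // batch_size*seq_len*feature_dim values
    const std::vector<float>& halt_params,  // feature_dim weights + 1 bias
    long batch_size,
    long seq_len,
    long feature_dim,
    std::vector<float>& logits,             // one pre-sigmoid value per position
    std::vector<float>& halt_probs          // one probability per position
)
{
    const long num_positions = batch_size * seq_len;
    logits.assign(num_positions, 0.0f);
    halt_probs.assign(num_positions, 0.0f);
    const float bias = halt_params[feature_dim];
    for (long pos = 0; pos < num_positions; ++pos)
    {
        float z = bias;                                              // b_halt
        for (long f = 0; f < feature_dim; ++f)
            z += halt_params[f] * input_data[pos*feature_dim + f];   // W_halt^T * input
        logits[pos] = z;                                             // pre-sigmoid value
        halt_probs[pos] = 1.0f / (1.0f + std::exp(-z));              // sigmoid(z)
    }
}
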
@@ -2432,10 +2439,24 @@ namespace dlib { namespace tt
         requires
             - 0 < halt_threshold <= 1.0
             - current_step >= 0
-        ensures
-            - Updates ACT state for all positions
-            - Accumulates weighted outputs: output += α_t^n · input_data
-            - Updates cumulative_halting, remainders, and n_steps
+            - input_data.num_samples() == batch_size
+            - input_data.k() == num_channels
+            - input_data.nr() == seq_len
+            - input_data.nc() == d_model
+            - output has the same dimensions as input_data
+            - halt_probs.size() == batch_size * seq_len
+            - cumulative_halting.size() == remainders.size() == n_steps.size() == batch_size * seq_len
+        ensures
+            - Core ACT update step that accumulates weighted outputs:
+                - Updates ACT state for all positions
+                - Accumulates weighted outputs: output += α_t^n * input_data
+                - Updates cumulative_halting, remainders, and n_steps
+            - batch_size: number of samples in the batch
+            - seq_len: sequence length (number of positions to process)
+            - d_model: model dimension per channel
+            - num_channels: number of feature channels
+            - halt_threshold: halting threshold (typically 0.99)
+            - current_step: current computation step index (0-based)
     !*/
 
     void finalize_act_output (
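
The update step is the core ACT bookkeeping. The sketch below shows one plausible reading that follows the standard Adaptive Computation Time formulation, with flat per-position buffers assumed; in particular, how the halting step itself is weighted is an assumption here, since the spec defers the remainder contribution to finalize_act_output.

#include <vector>

// Illustrative sketch only: one ACT update step under assumed flat layouts
// (state buffers of length batch_size*seq_len, d = num_channels*d_model
// feature values per position).  The actual kernel may differ in details.
void update_act_state_sketch (
    const std::vector<float>& input_data,     // current step's features
    std::vector<float>& output,               // accumulated output, same size
    const std::vector<float>& halt_probs,     // halting probability per position
    std::vector<float>& cumulative_halting,
    std::vector<float>& remainders,
    std::vector<float>& n_steps,
    long num_positions,
    long d,                                   // feature values per position
    float halt_threshold,
    long current_step
)
{
    for (long pos = 0; pos < num_positions; ++pos)
    {
        if (cumulative_halting[pos] >= halt_threshold)
            continue;                         // this position has already halted

        const float p = halt_probs[pos];
        n_steps[pos] = static_cast<float>(current_step + 1);

        if (cumulative_halting[pos] + p >= halt_threshold)
        {
            // Halting step: record the remainder ρ_t = 1 - Σ previous probabilities.
            // Its contribution is added later by the finalize step.
            remainders[pos] = 1.0f - cumulative_halting[pos];
            cumulative_halting[pos] += p;     // now >= halt_threshold, so it halts
        }
        else
        {
            // Intermediate step: accumulate with weight α_t^n = p.
            cumulative_halting[pos] += p;
            for (long i = 0; i < d; ++i)
                output[pos*d + i] += p * input_data[pos*d + i];
        }
    }
}
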
@@ -2448,9 +2469,21 @@ namespace dlib { namespace tt
         long num_channels
     );
     /*!
+        requires
+            - input_data.num_samples() == batch_size
+            - input_data.k() == num_channels
+            - input_data.nr() == seq_len
+            - input_data.nc() == d_model
+            - output has the same dimensions as input_data
+            - remainders.size() == batch_size * seq_len
         ensures
-            - Adds final remainder contributions: output += ρ_t · input_data
-            - Applied only to positions with significant remainder (> 1e-6)
+            - Finalizes ACT output by adding remainder contributions:
+                - Adds final remainder contributions: output += ρ_t * input_data
+                - Applied only to positions with significant remainder (> 1e-6)
+            - batch_size: number of samples in the batch
+            - seq_len: sequence length (number of positions to process)
+            - d_model: model dimension per channel
+            - num_channels: number of feature channels
     !*/
 
     void apply_act_depth_scaling (
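
Finalization just adds the remainder-weighted contribution of the final step's features, skipping positions whose remainder is negligible. A sketch under the same flat-layout assumptions as above:

#include <vector>

// Illustrative sketch only: add the remainder-weighted contribution of the
// final step's features to the accumulated ACT output.
void finalize_act_output_sketch (
    const std::vector<float>& input_data,   // final step's features
    const std::vector<float>& remainders,   // ρ_t per position
    std::vector<float>& output,             // accumulated ACT output, updated in place
    long num_positions,
    long d                                  // feature values per position
)
{
    for (long pos = 0; pos < num_positions; ++pos)
    {
        const float rho = remainders[pos];
        if (rho <= 1e-6f)
            continue;                       // no significant remainder for this position
        for (long i = 0; i < d; ++i)
            output[pos*d + i] += rho * input_data[pos*d + i];   // output += ρ_t * input
    }
}
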
@@ -2466,9 +2499,21 @@ namespace dlib { namespace tt
     /*!
         requires
             - scale_factor >= 0
-        ensures
-            - Applies depth-dependent gradient scaling
-            - scale = 1 + scale_factor * (n_steps[pos] / max_steps)
+            - max_steps > 0
+            - gradients.num_samples() == batch_size
+            - gradients.k() == num_channels
+            - gradients.nr() == seq_len
+            - gradients.nc() == d_model
+            - n_steps.size() == batch_size * seq_len
+        ensures
+            - Applies gradient scaling based on computation depth:
+                - Applies depth-dependent gradient scaling
+                - scale = 1 + scale_factor * (n_steps[pos] / max_steps)
+            - seq_len: sequence length (number of positions to process)
+            - d_model: model dimension per channel
+            - num_channels: number of feature channels
+            - max_steps: maximum allowed computation steps
+            - scale_factor: scaling strength (0 = no scaling)
     !*/
 
 // ----------------------------------------------------------------------------------------
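
Depth scaling is a single multiplicative factor per position. A sketch under the same flat-layout assumptions (the function name is illustrative):

#include <vector>

// Illustrative sketch only: scale each position's gradients by how many ACT
// steps that position used, scale = 1 + scale_factor * (n_steps[pos] / max_steps).
void apply_act_depth_scaling_sketch (
    std::vector<float>& gradients,      // per-position gradients, d values each
    const std::vector<float>& n_steps,  // steps actually taken per position
    long num_positions,
    long d,                             // feature values per position
    long max_steps,
    float scale_factor
)
{
    for (long pos = 0; pos < num_positions; ++pos)
    {
        const float scale = 1.0f + scale_factor * (n_steps[pos] / static_cast<float>(max_steps));
        for (long i = 0; i < d; ++i)
            gradients[pos*d + i] *= scale;
    }
}
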