
Commit 9784508

📝 Add docstrings to kaix/kvcache_mcore
Docstrings generation was requested by @kaix-nv.

* #375 (comment)

The following files were modified:

* `modelopt/torch/quantization/plugins/megatron.py`
* `tests/gpu/torch/quantization/plugins/test_megatron.py`
1 parent 93bab27 commit 9784508

2 files changed: +120 additions, -23 deletions

modelopt/torch/quantization/plugins/megatron.py

Lines changed: 68 additions & 9 deletions
@@ -461,6 +461,14 @@ class _RealQuantMegatronRowParallelLinear(
     _scale_tensor_shard_axis = 1

     def forward(self, input, *args, **kwargs):
+        """
+        Compute the forward pass using the row-parallel linear implementation.
+
+        Forwards all positional and keyword arguments to the row-parallel parent implementation.
+
+        Returns:
+            torch.Tensor: The output activations produced by the linear layer.
+        """
         return _MegatronRowParallelLinear.forward(self, input, *args, **kwargs)


@@ -469,13 +477,19 @@ class _QuantTEDotProductAttention(QuantModule):
     """Quantized version of TEDotProductAttention for Megatron models with KV cache quantization."""

     def _setup(self):
-        """Initialize quantizers for Q, K, V tensors."""
+        """
+        Create and attach three TensorQuantizer instances as q_bmm_quantizer, k_bmm_quantizer, and v_bmm_quantizer for quantizing query, key, and value tensors.
+        """
         self.q_bmm_quantizer = TensorQuantizer()
         self.k_bmm_quantizer = TensorQuantizer()
         self.v_bmm_quantizer = TensorQuantizer()

     def _calibrate_quantizers(self):
-        """Calibrate quantizers with minimal dummy tensors."""
+        """
+        Calibrate the module's Q/K/V tensor quantizers using minimal dummy inputs.
+
+        Creates a tiny float16 dummy tensor shaped according to the attention QKV layout (either "sbhd" or "bshd", determined from self.config.apply_rope_fusion) and uses it to compute and store `_amax` values for any enabled q_bmm_quantizer, k_bmm_quantizer, or v_bmm_quantizer that does not yet have an `_amax`. Calibration is performed only for quantizers that are enabled and lack existing scale information.
+        """
         # Get device from parent module parameters
         device = next(self.parameters()).device if self.parameters() else torch.device("cuda")

@@ -518,10 +532,16 @@ def _calibrate_quantizers(self):
             max_calibrate(quantizer, lambda q: q(dummy_tensor), distributed_sync=False)

     def forward(self, query, key, value, *args, **kwargs):
-        """Apply post-RoPE quantization to KV cache.
-
-        TEDotProductAttention receives Q, K, V after RoPE is applied,
-        so we quantize them directly for KV cache quantization.
+        """
+        Quantize the provided query, key, and value tensors for the KV cache and forward them to the base attention implementation.
+
+        Parameters:
+            query (Tensor): Query tensor (already rotated by RoPE) to be quantized and used for attention.
+            key (Tensor): Key tensor (already rotated by RoPE) to be quantized and used for attention.
+            value (Tensor): Value tensor to be quantized and used for attention.
+
+        Returns:
+            The output of the parent attention `forward` called with the quantized query, key, and value.
         """
         # Quantize Q, K, V
         query = self.q_bmm_quantizer(query)
@@ -531,7 +551,20 @@ def forward(self, query, key, value, *args, **kwargs):
         return super().forward(query, key, value, *args, **kwargs)

     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Create a sharded state dictionary for distributed checkpointing."""
+        """
+        Builds a sharded state dictionary containing non-quantizer parameters and bmm-quantizer state for distributed checkpointing.
+
+        Parameters:
+            prefix (str): Key prefix to prepend to returned state keys.
+            sharded_offsets (tuple): Offsets describing shard positions for sharded tensors (passed to make_sharded_tensors_for_checkpoint).
+            metadata: Ignored by this implementation (kept for API compatibility).
+
+        Returns:
+            state_dict (dict): Mapping from checkpoint keys to tensors, including:
+                - Non-quantizer module tensors (prefixed).
+                - Per-quantizer `_amax` entries for q/k/v bmm quantizers when present.
+                - Other quantizer tensors processed into sharded tensors via the checkpoint helper.
+        """
         sharded_state_dict = {}

         # First add non-quantizer parameters
@@ -566,7 +599,18 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         return sharded_state_dict

     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        """Handle loading state dict for quantizers."""
+        """
+        Adjust quantizer entries in a loaded state dict to match this module's expected keys and tensor shapes before delegating to the parent loader.
+
+        This method:
+        - Renames per-quantizer `_amax` entries (stored as `{prefix}{quantizer_name}._amax`) to the key format expected by this module's local TensorQuantizer instances.
+        - Reshapes any remaining quantizer state tensors (keys containing `_quantizer` but not `_amax`) to match the corresponding tensor shapes in this module's `state_dict`.
+        - Calls the superclass `_load_from_state_dict` with the adjusted `state_dict`.
+
+        Parameters:
+            state_dict (dict): The incoming state dictionary being loaded; modified in-place to align quantizer keys and shapes.
+            prefix (str): The prefix applied to keys for this module in `state_dict`.
+        """
         for quantizer_name in ["q_bmm_quantizer", "k_bmm_quantizer", "v_bmm_quantizer"]:
             full_prefix = f"{prefix}{quantizer_name}."
             amax_key = f"{prefix}{quantizer_name}._amax"
@@ -586,10 +630,25 @@ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

     def modelopt_post_restore(self, name=""):
-        """Restore quantizer states after model loading."""
+        """
+        Perform post-restore validation for attention quantizers and trigger calibration if needed.
+
+        Checks each of the instance's Q/K/V BMM quantizers (if present and enabled) for unsupported saved state keys and emits a warning identifying the provided `name` when such keys are found. If any enabled quantizer lacks a stored `_amax` value, schedules and runs quantizer calibration by calling self._calibrate_quantizers().
+
+        Parameters:
+            name (str): Human-readable identifier for the module being restored; included in warning messages to help locate the layer.
+        """
         super().modelopt_post_restore(name)

         def _check_unsupported_states(quantizer):
+            """
+            Check a quantizer's saved state keys and warn about any unsupported entries.
+
+            Inspects quantizer.state_dict() (if present) and emits a warning for each key other than `_amax` and `_pre_quant_scale` indicating that restoring that key is not supported.
+
+            Parameters:
+                quantizer: An object with a `state_dict()` method (typically a TensorQuantizer) whose saved state keys will be validated.
+            """
             if not hasattr(quantizer, "state_dict"):
                 return

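The docstrings above describe a simple pattern: `_setup` attaches q/k/v TensorQuantizers, and `forward` quantizes the post-RoPE Q, K, and V tensors before handing them to the parent attention. The following minimal sketch illustrates that pattern outside of Megatron; `SimpleAmaxQuantizer` and `KVCacheQuantAttention` are hypothetical stand-ins, not modelopt classes, and the amax-based fake quantization is only an assumption for illustration.

import torch


class SimpleAmaxQuantizer:
    """Hypothetical stand-in for TensorQuantizer: records an amax and fake-quantizes to an int8 range."""

    def __init__(self):
        self.amax = None          # analogous to TensorQuantizer's _amax buffer
        self.enabled = True

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        if not self.enabled:
            return x
        if self.amax is None:     # "calibration": remember the largest magnitude seen
            self.amax = x.abs().max()
        scale = self.amax.clamp_min(1e-8) / 127.0
        return (x / scale).round().clamp(-127, 127) * scale


class KVCacheQuantAttention(torch.nn.Module):
    """Sketch of the post-RoPE KV-cache quantization pattern described in the docstrings above."""

    def __init__(self, inner_attention):
        super().__init__()
        self.inner_attention = inner_attention
        self.q_bmm_quantizer = SimpleAmaxQuantizer()
        self.k_bmm_quantizer = SimpleAmaxQuantizer()
        self.v_bmm_quantizer = SimpleAmaxQuantizer()

    def forward(self, query, key, value, *args, **kwargs):
        # Q/K/V arrive here with RoPE already applied, so quantizing at this point
        # quantizes exactly what would be written into the KV cache.
        query = self.q_bmm_quantizer(query)
        key = self.k_bmm_quantizer(key)
        value = self.v_bmm_quantizer(value)
        return self.inner_attention(query, key, value, *args, **kwargs)

Wrapping any attention callable, for example a plain scaled-dot-product function, is enough to exercise this wrapper and observe the recorded amax values.
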
tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 52 additions & 14 deletions
@@ -367,12 +367,26 @@ def forward_fn(model):


 def test_fp8_real_quantize():
+    """
+    Runs the FP8 real quantization memory reduction test across all available CUDA devices.
+
+    Spawns a multiprocess job (NCCL backend) that executes _test_fp8_real_quantize_helper on each detected GPU.
+    """
     size = torch.cuda.device_count()
     spawn_multiprocess_job(size=size, job=_test_fp8_real_quantize_helper, backend="nccl")


 def _test_kv_cache_quant_helper(config, rank, size):
-    """Helper function for testing KV cache quantization with TEDotProductAttention."""
+    """
+    Verify that TEDotProductAttention modules receive KV-cache quantization and remain functional after quantization.
+
+    Quantizes a minimal GPT model (built with transformer_impl="modelopt") using the provided `config`, checks that each TEDotProductAttention-like module exposes `k_bmm_quantizer` and `v_bmm_quantizer`, asserts those quantizers are enabled, and performs a smoke forward pass to ensure the quantized model runs.
+
+    Parameters:
+        config: Quantization configuration used to quantize the model (e.g., FP8_KV_CFG or NVFP4_KV_CFG).
+        rank (int): Process rank in the distributed test invocation.
+        size (int): Tensor model parallel size used to construct the model.
+    """
     initialize_for_megatron(
         tensor_model_parallel_size=size, pipeline_model_parallel_size=1, seed=SEED
     )
@@ -392,6 +406,15 @@ def _test_kv_cache_quant_helper(config, rank, size):
     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()

     def forward_fn(model):
+        """
+        Run megatron_prefill with the predefined `prompt_tokens` on the given model.
+
+        Parameters:
+            model: The Megatron model to execute the prefill pass on.
+
+        Returns:
+            The outputs produced by `megatron_prefill` for the provided model.
+        """
         return megatron_prefill(model, prompt_tokens)

     # Test KV cache quantization with the given config
@@ -418,7 +441,17 @@ def forward_fn(model):


 def _test_kv_cache_sharded_state_dict_helper(tmp_path, config, rank, size):
-    """Helper for testing KV cache quantization with sharded state dict save/load."""
+    """
+    Validate that KV-cache quantizers on TEDotProductAttention modules are created and correctly preserved across sharded state_dict save/load.
+
+    This helper initializes Megatron, constructs two small GPT models using transformer_impl="modelopt" (TEDotProductAttention), quantizes both models with the provided config (KV quantizers must exist on both before checkpointing), runs a sharded save/load roundtrip, and asserts that k_bmm_quantizer and v_bmm_quantizer instances are present and that their internal `_amax` tensors (when present) match between the reference and restored models.
+
+    Parameters:
+        tmp_path (pathlib.Path): Temporary directory to write sharded checkpoints.
+        config (dict): Quantization configuration dictionary to use for mtq.quantize.
+        rank (int): Distributed process rank for this helper.
+        size (int): Tensor-model-parallel size / world size used to initialize Megatron.
+    """
     # Disable output_layer quantization (same as other sharded state dict tests)
     config["quant_cfg"]["*output_layer*"] = {"enable": False}

@@ -450,6 +483,15 @@ def _test_kv_cache_sharded_state_dict_helper(tmp_path, config, rank, size):
     ).cuda()

     def forward_fn(model):
+        """
+        Run megatron_prefill with the predefined `prompt_tokens` on the given model.
+
+        Parameters:
+            model: The Megatron model to execute the prefill pass on.
+
+        Returns:
+            The outputs produced by `megatron_prefill` for the provided model.
+        """
         return megatron_prefill(model, prompt_tokens)

     # Quantize the reference model
@@ -519,13 +561,10 @@ def forward_fn(model):
     ],
 )
 def test_kv_cache_quant(config):
-    """Verify KV cache quantization works correctly with TEDotProductAttention.
-
-    This test ensures TEDotProductAttention is properly registered and gets the
-    expected q/k/v_bmm_quantizers when using KV cache configs.
-
-    Note: This test requires Transformer Engine to be installed since TEDotProductAttention
-    is only available with transformer_impl="modelopt" or "transformer_engine" (not "local").
+    """
+    Verify that TEDotProductAttention modules gain the expected KV-cache quantizers when using KV cache configurations.
+
+    Runs the KV-cache quantization smoke test via a single-process multiprocess spawn. Requires Transformer Engine, since TEDotProductAttention is only available with transformer_impl="modelopt" or "transformer_engine" (not "local").
     """
     spawn_multiprocess_job(size=1, job=partial(_test_kv_cache_quant_helper, config), backend="nccl")

@@ -538,11 +577,10 @@ def test_kv_cache_quant(config):

 )
 def test_kv_cache_sharded_state_dict(tmp_path, config):
-    """Test KV cache quantization with sharded state dict save/load.
-
-    This test verifies the complete workflow of saving and loading KV cache quantized
-    models with distributed checkpointing, ensuring quantizer states are properly
-    preserved across the save/load cycle.
+    """
+    Run a sharded-state-dict save/load test for KV-cache quantized models.
+
+    Spawns up to 2 processes using NCCL to execute the sharded-state-dict helper with the provided temporary path and quantization config.
     """
     size = min(2, torch.cuda.device_count()) # Use 2 GPUs if available, else 1
     spawn_multiprocess_job(
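
The test helpers follow the usual quantize-then-assert flow. The sketch below condenses that flow under a few assumptions: it uses mtq.quantize(model, config, forward_loop), which the tests also rely on, while has_kv_quantizers and quantize_and_check_kv_cache are hypothetical names that stand in for the real per-module assertions, the Megatron model construction, and the NCCL multiprocess spawn.

import modelopt.torch.quantization as mtq


def has_kv_quantizers(module) -> bool:
    """Hypothetical helper: true if a module carries the KV-cache bmm quantizers."""
    return hasattr(module, "k_bmm_quantizer") and hasattr(module, "v_bmm_quantizer")


def quantize_and_check_kv_cache(model, config, calib_forward):
    """Quantize `model` with a KV-cache config and assert that attention modules got quantizers."""
    model = mtq.quantize(model, config, calib_forward)  # calibration runs via calib_forward
    attention_modules = [
        m for m in model.modules() if type(m).__name__.endswith("DotProductAttention")
    ]
    assert attention_modules, "expected at least one attention module"
    assert all(has_kv_quantizers(m) for m in attention_modules)
    return model

In the actual tests this logic runs inside spawn_multiprocess_job with the NCCL backend, on a Megatron GPT model built with transformer_impl="modelopt", using megatron_prefill as the calibration forward pass.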
