37 | 37 | )
38 | 38 | from modelopt.torch.utils.distributed import ParallelState
39 | 39 |
40 |    | -from ..nn import QuantModuleRegistry, SequentialQuantizer, TensorQuantizer
   | 40 | +from ..nn import QuantModuleRegistry, TensorQuantizer
41 | 41 | from ..nn.modules.quant_linear import RealQuantLinear, _QuantLinear
42 | 42 | from ..qtensor import QTensorWrapper
43 | 43 | from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear
@@ -501,7 +501,6 @@ def _setup(self):
501 | 501 | self.parallel_state = ParallelState(
502 | 502 |     data_parallel_group,
503 | 503 |     mcore_parallel.get_tensor_model_parallel_group(),
504 |     | -    mcore_parallel.get_context_parallel_group(),
505 | 504 |     mcore_parallel.get_expert_model_parallel_group(),
506 | 505 |     expert_tensor_parallel_group,
507 | 506 | )
@@ -544,70 +543,13 @@ def te_grouped_quantized_linear_fn(ctx, inp, m_splits, *args):
544 | 543 | ]
545 | 544 |
546 | 545 | def modelopt_post_restore(self, prefix: str = ""):
547 |     | -    """Post restore to correctly configure the TensorQuantizer states for MCore/distributed frameworks.
548 |     | -
549 |     | -    ModelOpt restores the TensorQuantizer states such as `_amax` and `_pre_quant_scale` to their
550 |     | -    shape before saving. However this is not enough for MCore/distributed frameworks since the tensor parallelism
551 |     | -    could change between saving and restoring. If the tensor parallelism changes, the shape of the quantizer
552 |     | -    states also changes. So we need to re-calculate the quantizer states.
553 |     | -    """
554 |     | -    from modelopt.torch.quantization.model_calib import max_calibrate
555 |     | -
556 |     | -    def _check_unsupported_states(quantizer: TensorQuantizer):
557 |     | -        for k in quantizer.state_dict():
558 |     | -            if k not in ["_amax", "_pre_quant_scale"]:
559 |     | -                warnings.warn(
560 |     | -                    f"Restore of {k} for {prefix} is not supported. The restore of this layer might be "
561 |     | -                    f"incorrect. Please implement a custom restore for {k}."
562 |     | -                )
563 |     | -
564 |     | -    def _has_state(quantizer, name):
565 |     | -        # Handling for SequentialQuantizer
566 |     | -        quantizer = quantizer[0] if isinstance(quantizer, SequentialQuantizer) else quantizer
567 |     | -        return hasattr(quantizer, name)
568 |     | -
569 |     | -    # weights for TEGroupedLinear are stored in weight0, weight1, etc.
570 |     | -    if self.weight0 is None:
571 |     | -        return
572 |     | -    for quantizer in [self.weight_quantizer, self.input_quantizer, self.output_quantizer]:
573 |     | -        _check_unsupported_states(
574 |     | -            quantizer if isinstance(quantizer, TensorQuantizer) else quantizer[0]
575 |     | -        )
576 |     | -    if _has_state(self.weight_quantizer, "_amax"):
577 |     | -        self.weight_quantizer.reset_amax()
578 |     | -        for i in range(self.num_gemms):
579 |     | -            weight = getattr(self, f"weight{i}")
580 |     | -            assert weight is not None, "weight is None"
581 |     | -
582 |     | -            max_calibrate(self.weight_quantizer, lambda wq: wq(weight), distributed_sync=False)
583 |     | -    if _has_state(self.input_quantizer, "_pre_quant_scale"):
584 |     | -        if hasattr(self.input_quantizer, "_pre_quant_scale"):
585 |     | -            delattr(self.input_quantizer, "_pre_quant_scale")
586 |     | -        pqs = torch.zeros(
587 |     | -            (weight.shape[1]), device=weight.device, dtype=self.original_weight_dtype
588 |     | -        )
589 |     | -        self.input_quantizer.register_buffer("_pre_quant_scale", pqs)
590 |     | -
591 |     | -    if _has_state(self.input_quantizer, "_amax"):
592 |     | -        self.input_quantizer.reset_amax()
593 |     | -        dummy_input = torch.ones(
594 |     | -            (1, 1, self.weight0.shape[1]),
595 |     | -            device=self.weight0.device,
596 |     | -            dtype=self.original_weight_dtype,
597 |     | -        )
598 |     | -        max_calibrate(self.input_quantizer, lambda iq: iq(dummy_input), distributed_sync=False)
599 |     | -    if _has_state(self.output_quantizer, "_amax"):
600 |     | -        self.output_quantizer.reset_amax()
601 |     | -        dummy_input = torch.ones(
602 |     | -            (1, 1, self.weight0.shape[0]),
603 |     | -            device=self.weight0.device,
604 |     | -            dtype=self.original_weight_dtype,
605 |     | -        )
606 |     | -        max_calibrate(self.output_quantizer, lambda oq: oq(dummy_input), distributed_sync=False)
607 |     | -    # If there are any other states, lets move them to the correct device
608 |     | -
609 |     | -    self.weight = None
    | 546 | +    # GroupedMLP stores the weights as weight0, weight1, etc. post_restore initializes the
    | 547 | +    # quantizer states and uses self.weight to infer their shape, dtype, and device, so
    | 548 | +    # temporarily alias self.weight0 as self.weight before calling it.
    | 549 | +    self.weight = self.weight0
610 | 550 |     super().modelopt_post_restore(prefix=prefix)
    | 551 | +    # Revert self.weight to None afterwards so the temporary alias is not used in the forward pass.
    | 552 | +    self.weight = None
611 | 553 |
612 | 554 | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
613 | 555 |     # _sharded_state_dict_grouped adds _extra_state{gemm_idx} for gemm_idx:[1, num_gemms] in
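For reference, here is a minimal, self-contained sketch of the aliasing pattern the new `modelopt_post_restore` relies on: point `self.weight` at `weight0` so the generic post-restore path can infer shape/dtype/device, then clear the alias. The class names `_BaseQuantLinear` and `_GroupedQuantLinear` and the `amax` bookkeeping below are made-up stand-ins, not the modelopt implementation; only the alias/restore/clear sequence mirrors the diff.

```python
import torch
import torch.nn as nn


class _BaseQuantLinear(nn.Module):
    """Stand-in for a quantized linear whose post-restore sizes state from ``self.weight``."""

    def __init__(self):
        super().__init__()
        self.weight = None  # regular linears hold their weight here
        self.amax = None

    def modelopt_post_restore(self, prefix: str = ""):
        # Rebuild per-output-channel amax from the (possibly re-sharded) weight.
        if self.weight is None:
            return
        self.amax = self.weight.detach().abs().amax(dim=1)


class _GroupedQuantLinear(_BaseQuantLinear):
    """Grouped variant: weights live in weight0, weight1, ...; ``self.weight`` stays None."""

    def __init__(self, num_gemms: int, out_features: int, in_features: int):
        super().__init__()
        self.num_gemms = num_gemms
        for i in range(num_gemms):
            setattr(self, f"weight{i}", nn.Parameter(torch.randn(out_features, in_features)))

    def modelopt_post_restore(self, prefix: str = ""):
        # Temporarily alias the first grouped weight so the base-class logic can
        # infer shape/dtype/device, then drop the alias again.
        self.weight = self.weight0
        super().modelopt_post_restore(prefix=prefix)
        self.weight = None


layer = _GroupedQuantLinear(num_gemms=2, out_features=8, in_features=4)
layer.modelopt_post_restore()
print(layer.weight is None, layer.amax.shape)  # -> True torch.Size([8])
```

In this sketch the base class only reads `self.weight` during post-restore, so the temporary alias is enough to reuse the common restore path without duplicating the grouped-weight calibration logic that the old override carried.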