 
 def sync_amax_across_sequential_mlp(model: torch.nn.Module):
     """Sync amax across experts in a SequentialMLP."""
-    amax_dict = {
-        "linear_fc1.input_quantizer": {},
-        "linear_fc1.weight_quantizer": {},
-        "linear_fc2.input_quantizer": {},
-        "linear_fc2.weight_quantizer": {},
-    }
-    # gather amax values from SequentialMLP experts
-    for name, module in model.named_modules():
+    amax_dict = {}
+
+    def get_sequential_mlp_expert_names(name: str, module: torch.nn.Module):
         if (
-            not isinstance(module, TensorQuantizer)
-            or not hasattr(module, "_amax")
-            or "local_experts" not in name
+            isinstance(module, TensorQuantizer)
+            and hasattr(module, "_amax")
+            and ".local_experts." in name
         ):
-            continue
-        expert_name, local_expert_name = name.split("local_experts")
-        for key in amax_dict:
-            if key in local_expert_name:
-                amax_dict[key][expert_name] = max(amax_dict[key].get(expert_name, 0), module.amax)
+            expert_name, local_expert_name = name.split(".local_experts.")
+            # extract quantizer name by removing the local expert index from the name
+            local_expert_name = ".".join(local_expert_name.split(".")[1:])
+            return expert_name, local_expert_name
+        return None, None
+
+    # gather amax values from SequentialMLP experts
+    for name, module in model.named_modules():
+        expert_name, local_expert_name = get_sequential_mlp_expert_names(name, module)
+        if expert_name and local_expert_name:
+            amax_dict[local_expert_name] = amax_dict.get(local_expert_name, {})
+            amax_dict[local_expert_name][expert_name] = max(
+                amax_dict[local_expert_name].get(expert_name, 0), module.amax
+            )
 
     # sync amax values across experts in SequentialMLP
     for name, module in model.named_modules():
-        if (
-            not isinstance(module, TensorQuantizer)
-            or not hasattr(module, "_amax")
-            or "local_experts" not in name
-        ):
-            continue
-        expert_name, local_expert_name = name.split("local_experts")
-        for key in amax_dict:
-            if key in local_expert_name:
-                module.amax = amax_dict[key][expert_name]
+        expert_name, local_expert_name = get_sequential_mlp_expert_names(name, module)
+        if expert_name and local_expert_name:
+            module.amax = amax_dict[local_expert_name][expert_name]
 
 
 CUSTOM_POST_CALIBRATION_PLUGINS.add(sync_amax_across_sequential_mlp)
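
For readers skimming the diff: the new helper keys `amax_dict` by quantizer name and by the prefix before `.local_experts.`, so the max is taken across the local experts of each SequentialMLP. A minimal sketch of the name splitting (the module path below is illustrative of a typical Megatron MoE layout, not taken from this change):

```python
# Hypothetical quantizer module name for local expert 3 of one SequentialMLP.
name = "decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.input_quantizer"

expert_name, local_expert_name = name.split(".local_experts.")
# expert_name       -> "decoder.layers.0.mlp.experts"   (identifies the SequentialMLP)
# local_expert_name -> "3.linear_fc1.input_quantizer"

# Dropping the expert index leaves the quantizer key shared by all local experts.
local_expert_name = ".".join(local_expert_name.split(".")[1:])
# local_expert_name -> "linear_fc1.input_quantizer"
```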
@@ -523,6 +520,11 @@ def forward(self, input, *args, **kwargs):
 # Register the public te.pytorch.GroupedLinear class
 @QuantModuleRegistry.register({te_grouped_linear.GroupedLinear: "te_GroupedLinear"})
 class _QuantMegatronTEGroupedLinear(_MegatronParallelLinear):
+    _functionals_to_replace = [
+        (te_grouped_linear._GroupedLinear, "forward"),
+        (te_grouped_linear._GroupedLinear, "apply"),
+    ]
+
     def _setup(self):
         # GroupedMLP stores the weights as weight0, weight1, etc. To run setup in order to
         # initialize the quantizer states, self.weight is used to extract shape, dtype etc. Assigning
@@ -531,46 +533,17 @@ def _setup(self):
         # Memorize the original weight.dtype for modelopt_post_restore given that
         # the dtype can change later.
         super()._setup()
-        # Revert the weight to None after setup.
-        self.weight = None
-
-    @property
-    def functionals_to_replace(self):
-        original_forward = te_grouped_linear._GroupedLinear.forward
-
-        def te_grouped_quantized_linear_fn(ctx, inp, m_splits, *args):
-            num_gemms = len(m_splits)
-            weights_and_biases = args[-2 * num_gemms :]
-            weights, biases = weights_and_biases[:num_gemms], weights_and_biases[num_gemms:]
-            quantized_inputs = self.input_quantizer(inp)
-            quantized_weights = [self.weight_quantizer(weight) for weight in weights]
-
-            output = original_forward(
-                ctx,
-                quantized_inputs,
-                m_splits,
-                *args[: -2 * num_gemms],
-                *quantized_weights,
-                *biases,
-            )
-            return self.output_quantizer(output)
-
-        return [
-            (
-                te_grouped_linear._GroupedLinear,
-                "forward",
-                te_grouped_quantized_linear_fn,
-            ),
-        ]
+        # Remove self.weight after setup.
+        delattr(self, "weight")
 
     def modelopt_post_restore(self, prefix: str = ""):
         # GroupedMLP stores the weights as weight0, weight1, etc. To run post_restore in order to
         # initialize the quantizer states, self.weight is used to extract shape, dtype etc. Assigning
         # self.weight0 to self.weight to run the quantizer states initialization.
         self.weight = self.weight0
         super().modelopt_post_restore(prefix=prefix)
-        # Revert the weight to None after post_restore.
-        self.weight = None
+        # Remove self.weight after post_restore.
+        delattr(self, "weight")
 
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         # _sharded_state_dict_grouped adds _extra_state{gemm_idx} for gemm_idx:[1, num_gemms] in
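
One observable effect of swapping `self.weight = None` for `delattr(self, "weight")`: assigning `None` leaves a placeholder parameter registered on the module, while `delattr` removes the attribute entirely. A minimal sketch on a bare `torch.nn.Module` (not the grouped linear class itself):

```python
import torch

m = torch.nn.Module()
m.weight = torch.nn.Parameter(torch.zeros(1))  # registered in m._parameters

m.weight = None              # nn.Module keeps a None entry for "weight"
print(hasattr(m, "weight"))  # True

delattr(m, "weight")         # the entry is removed altogether
print(hasattr(m, "weight"))  # False
```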
@@ -585,10 +558,34 @@ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         return super()._load_from_state_dict(filtered_state_dict, prefix, *args, **kwargs)
 
     def _process_quantizer_amax(self, k, v, quantizer_state_dict):
-        if v.ndim == 4:
-            quantizer_state_dict[k] = v.squeeze(1).squeeze(-1)
-        else:
-            quantizer_state_dict[k] = v.view(-1, 1) if v.numel() > 1 else v.view(-1)
+        assert v.numel() == 1, "TEGroupedLinear only supports per-tensor quantization"
+        quantizer_state_dict[k] = v.view(-1)
+
+    @staticmethod
+    def te_grouped_quantized_linear_fn(package, func_name, self, *args):
+        idx = 1 if func_name == "_forward" else 0
+        inp = args[idx]
+        num_gemms = len(args[idx + 1])
+        weights_and_biases = args[-2 * num_gemms :]
+        weights, biases = weights_and_biases[:num_gemms], weights_and_biases[num_gemms:]
+        quantized_inputs = self.input_quantizer(inp)
+        quantized_weights = [self.weight_quantizer(weight) for weight in weights]
+
+        output = getattr(package, func_name)(
+            *(
+                args[0],
+                quantized_inputs,
+            )
+            if func_name == "_forward"
+            else (quantized_inputs,),
+            *args[idx + 1 : -2 * num_gemms],
+            *quantized_weights,
+            *biases,
+        )
+        return self.output_quantizer(output)
+
+    # Override the quantized linear function
+    _quantized_linear_fn = te_grouped_quantized_linear_fn
 
 
 @QuantModuleRegistry.register(
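
The new `te_grouped_quantized_linear_fn` relies on a positional-argument layout assumed from TE's grouped linear: the autograd `ctx` comes first only for the raw `_forward`, the input and `m_splits` follow, and the last `2 * num_gemms` positionals are the per-GEMM weights followed by biases. A rough, self-contained sketch of the slicing with string placeholders (the layout is mirrored from the code above, not verified against a specific Transformer Engine release):

```python
# Placeholder args for func_name == "_forward" with num_gemms == 2:
#   (ctx, inp, m_splits, <other TE args...>, w0, w1, b0, b1)
args = ("ctx", "inp", ["m0", "m1"], "other", "w0", "w1", "b0", "b1")

idx = 1                                       # would be 0 for "apply" (no ctx argument)
inp = args[idx]                               # "inp"
num_gemms = len(args[idx + 1])                # 2
weights_and_biases = args[-2 * num_gemms :]   # ("w0", "w1", "b0", "b1")
weights = weights_and_biases[:num_gemms]      # ("w0", "w1")
biases = weights_and_biases[num_gemms:]       # ("b0", "b1")
passthrough = args[idx + 1 : -2 * num_gemms]  # (["m0", "m1"], "other")
print(weights, biases, passthrough)
```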
@@ -614,42 +611,36 @@ class _MegatronTEGroupedRowParallelLinear(
 class _MegatronTEGroupedMLP(_MegatronMLP):
     def _setup(self):
         if not hasattr(self, "parallel_state") or self.parallel_state is None:
-            data_parallel_group = None
-            try:
-                data_parallel_group = get_data_parallel_group(with_context_parallel=True)
-            except AssertionError:
-                logger.warning(
-                    "Context parallel group is not initialized, using data parallel group"
-                )
-                data_parallel_group = get_data_parallel_group()
-
-            try:
-                expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group()
-            except AssertionError:
-                expert_tensor_parallel_group = None
             self.parallel_state = ParallelState(
-                data_parallel_group,
-                tensor_parallel_group=expert_tensor_parallel_group,
-                expert_model_parallel_group=mcore_parallel.get_expert_model_parallel_group(),
+                mcore_parallel.get_expert_data_parallel_group(check_initialized=False),
+                tensor_parallel_group=mcore_parallel.get_expert_tensor_parallel_group(
+                    check_initialized=False
+                ),
+                expert_model_parallel_group=mcore_parallel.get_expert_model_parallel_group(
+                    check_initialized=False
+                ),
             )
+        # initialize parallel state for submodules linear_fc1 and linear_fc2
+        self.linear_fc1.parallel_state = self.parallel_state
+        self.linear_fc2.parallel_state = self.parallel_state
 
 
 # Register the public megatron_moe.SequentialMLP class
 @QuantModuleRegistry.register({megatron_moe.SequentialMLP: "megatron_moe_SequentialMLP"})
 class _MegatronSequentialMLP(_MegatronMLP):
     def _setup(self):
         if not hasattr(self, "parallel_state") or self.parallel_state is None:
-            try:
-                data_parallel_group = mcore_parallel.get_expert_data_parallel_group()
-            except AssertionError:
-                data_parallel_group = None
-
-            try:
-                expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group()
-            except AssertionError:
-                expert_tensor_parallel_group = None
             self.parallel_state = ParallelState(
-                data_parallel_group,
-                tensor_parallel_group=expert_tensor_parallel_group,
-                expert_model_parallel_group=mcore_parallel.get_expert_model_parallel_group(),
+                mcore_parallel.get_expert_data_parallel_group(check_initialized=False),
+                tensor_parallel_group=mcore_parallel.get_expert_tensor_parallel_group(
+                    check_initialized=False
+                ),
+                expert_model_parallel_group=mcore_parallel.get_expert_model_parallel_group(
+                    check_initialized=False
+                ),
             )
+
+        # Initialize parallel state for submodules local_experts.*.linear_fc1 and local_experts.*.linear_fc2
+        for expert in self.local_experts:
+            expert.linear_fc1.parallel_state = self.parallel_state
+            expert.linear_fc2.parallel_state = self.parallel_state
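
Both `_setup` rewrites above trade try/except probing for `check_initialized=False`. The assumption, suggested by the removed `AssertionError` fallbacks, is that Megatron then returns `None` for a group that has not been initialized. A before/after sketch (needs Megatron-Core and initialized process groups for the old path; shown for comparison only):

```python
from megatron.core import parallel_state as mcore_parallel

# Old pattern (removed above): probe the group and fall back on AssertionError.
try:
    expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group()
except AssertionError:
    expert_tensor_parallel_group = None

# New pattern: ask for the group without the initialization check.
expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group(
    check_initialized=False
)
```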