@@ -515,20 +515,21 @@ def copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_mo
515515
516516 # Map grouped weights to sequential weights
517517 weight_mapping = {}
518- sequential_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight "
518+ sequential_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}"
519519 for key , value in te_grouped_state .items ():
520- if "experts.linear_fc" in key and "weight" in key :
520+ if "experts.linear_fc" in key and any ( param in key for param in ( "weight" , "bias" )) :
521521 # Extract expert index from grouped weight name
522522 # Format: decoder.layers.X.mlp.experts.linear_fcY.weightZ
523523 parts = key .split ("." )
524524 layer_idx = parts [2 ] # X
525525 fc_idx = parts [5 ] # Y (linear_fc1 or linear_fc2)
526- weight_idx = parts [6 ] # Z ( weight0, weight1, etc.)
527-
528- # Map to sequential format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ.weight
529- expert_idx = weight_idx . replace ( "weight" , "" )
526+ param_idx = parts [6 ] # weight0 / bias0 / etc.
527+ match = re . search ( r"\d+" , param_idx )
528+ expert_idx = match . group ( 0 ) if match else "0" # Z for expert index
529+ # Map to sequential format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ
530530 sequential_key = sequential_key_template .format (layer_idx , expert_idx , fc_idx [- 1 ])
531- weight_mapping [sequential_key ] = value
531+ param_name = "weight" if "weight" in param_idx else "bias"
532+ weight_mapping [f"{ sequential_key } .{ param_name } " ] = value
532533 elif isinstance (value , torch .Tensor ):
533534 weight_mapping [key ] = value
534535
@@ -540,7 +541,7 @@ def copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_mo
540541 sequential_moe_model .load_state_dict (sequential_state )
541542
542543
543- def compare_amax_sync_across_expert_parallel (model ):
544+ def compare_amax_sync_across_expert_parallel (model , compare_across_experts = True ):
544545 """
545546 Test if amax values are synchronized across expert parallel groups.
546547
@@ -591,11 +592,12 @@ def compare_amax_sync_across_expert_parallel(model):
591592 quantizer_type in expert_quantizers
592593 and rank_idx in expert_quantizers [quantizer_type ]
593594 ):
594- # compare expert value across expert for sequential MoE
595- assert expert_quantizers [quantizer_type ][rank_idx ] == amax_val , (
596- f"{ rank_idx } , { quantizer_type } , expert_quantizers[quantizer_type][rank_idx]: "
597- f"{ expert_quantizers [quantizer_type ][rank_idx ]} , amax_val: { amax_val } "
598- )
595+ if compare_across_experts :
596+ # compare expert value across expert for sequential MoE
597+ assert expert_quantizers [quantizer_type ][rank_idx ] == amax_val , (
598+ f"{ rank_idx } , { quantizer_type } , expert_quantizers[quantizer_type][rank_idx]: "
599+ f"{ expert_quantizers [quantizer_type ][rank_idx ]} , amax_val: { amax_val } "
600+ )
599601 expert_quantizers [quantizer_type ][rank_idx ] = amax_val
600602
601603 # Check synchronization - fail fast on first inconsistency