
Commit 8bff6b0

fixed tests for per-tensor support
Signed-off-by: Kinjal Patel <[email protected]>
1 parent 28c8bbf commit 8bff6b0

File tree

3 files changed: +87 −41 lines changed


modelopt/torch/quantization/plugins/megatron.py

Lines changed: 9 additions & 3 deletions

@@ -506,9 +506,15 @@ def _setup(self):
             expert.linear_fc2.parallel_state = self.parallel_state

     def sync_moe_local_experts_amax(self):
-        """Sync amax across experts in a SequentialMLP."""
+        """Sync amax across local experts in a SequentialMLP.
+
+        Amax across EP and ETP (for RowParallel) is synchronized as part of
+        model_calib.max_calibrate(). This function synchronizes the amax values
+        across local experts so that all local experts share the same amax.
+        """
+        torch.distributed.barrier()
+        # Collect amax from all local experts
         amax_dict = {}
-        # gather amax values from SequentialMLP experts
         for expert in self.local_experts:
             for name, module in expert.named_modules():
                 if isinstance(module, TensorQuantizer) and module.amax is not None:
@@ -520,7 +526,7 @@ def sync_moe_local_experts_amax(self):
                         else torch.maximum(stored_amax, amax_tensor)
                     )

-        # sync amax values across experts in SequentialMLP
+        # Apply synchronized amax values back to all local experts
        for expert in self.local_experts:
             for name, module in expert.named_modules():
                 if isinstance(module, TensorQuantizer) and module.amax is not None:
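
For context, the pattern introduced above is a two-pass max-reduction: collect the elementwise maximum amax per quantizer name across all local experts, then write that maximum back so every local expert ends up with the same calibration range. Below is a minimal, self-contained sketch of the same idea in plain PyTorch; sync_amax_across_experts and the duck-typed amax attribute are illustrative stand-ins, not the ModelOpt API.

import torch
import torch.nn as nn


def sync_amax_across_experts(local_experts: list[nn.Module]) -> None:
    """Give every expert the per-quantizer elementwise-max amax (illustrative sketch)."""
    amax_dict: dict[str, torch.Tensor] = {}
    # Pass 1: collect the running maximum per quantizer name across all experts.
    for expert in local_experts:
        for name, module in expert.named_modules():
            amax = getattr(module, "amax", None)
            if amax is not None:
                if name not in amax_dict:
                    amax_dict[name] = amax.detach().clone()
                else:
                    amax_dict[name] = torch.maximum(amax_dict[name], amax)
    # Pass 2: write the shared maximum back into every expert.
    for expert in local_experts:
        for name, module in expert.named_modules():
            if getattr(module, "amax", None) is not None and name in amax_dict:
                module.amax.data.copy_(amax_dict[name])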

tests/_test_utils/torch_dist/plugins/megatron_common.py

Lines changed: 57 additions & 28 deletions

@@ -14,6 +14,7 @@
 # limitations under the License.
 import copy
 import re
+from collections import defaultdict
 from warnings import warn

 import torch
@@ -41,6 +42,7 @@
 from megatron.core.parallel_state import (
     get_expert_model_parallel_group,
     get_expert_tensor_parallel_group,
+    get_expert_tensor_parallel_rank,
     initialize_model_parallel,
     is_pipeline_first_stage,
     is_pipeline_last_stage,
@@ -190,7 +192,7 @@ def squared_relu(x):
         pipeline_model_parallel_size=pipeline_model_parallel_size,
         expert_model_parallel_size=expert_model_parallel_size,
         expert_tensor_parallel_size=expert_tensor_parallel_size,
-        sequence_parallel=expert_model_parallel_size > 1,
+        sequence_parallel=False,
         moe_grouped_gemm=moe_grouped_gemm,
         num_layers=num_layers,
         num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage,
@@ -565,8 +567,7 @@ def compare_amax_sync_across_expert_parallel(model, compare_across_experts=True)
             # Check for both TEGrouped and sequential MoE patterns
             if "local_experts" in name or ("experts" in name and "linear_fc" in name):
                 # Convert to scalar only if tensor has a single element
-                amax_val = module.amax.detach().clone().cpu()
-                expert_amax_values[name] = amax_val
+                expert_amax_values[name] = module.amax.detach().clone().cpu()

     # Early return if no expert quantizers found
     assert expert_amax_values, "No expert quantizers found"
@@ -577,19 +578,16 @@ def compare_amax_sync_across_expert_parallel(model, compare_across_experts=True)
     torch.distributed.all_gather_object(all_amax_values, expert_amax_values)

     # Group quantizers by type (ignoring specific expert indices) and check sync
-    expert_quantizers = {}
+    expert_quantizers = defaultdict(dict)
     for rank_idx, rank_amax in enumerate(all_amax_values):
         for name, amax_val in rank_amax.items():
             # Create quantizer type key by normalizing the name
-            if "local_experts" in name:
-                # sequential MoE: replace expert index with wildcard
-                quantizer_type = re.sub(r"local_experts\.\d+", "local_experts.*", name)
-            else:
-                # TEGrouped MoE: use the name as-is since experts are grouped
-                quantizer_type = name
-
-            if quantizer_type not in expert_quantizers:
-                expert_quantizers[quantizer_type] = {}
+            quantizer_type = (
+                re.sub(r"local_experts\.\d+", "local_experts.*", name)
+                if "local_experts" in name
+                else name
+            )
+
             if (
                 quantizer_type in expert_quantizers
                 and rank_idx in expert_quantizers[quantizer_type]
@@ -608,21 +606,52 @@ def compare_amax_sync_across_expert_parallel(model, compare_across_experts=True)
                 )
             expert_quantizers[quantizer_type][rank_idx] = amax_val

-    # Check synchronization - fail fast on first inconsistency
+    rank_info = {
+        "global_rank": torch.distributed.get_rank(),
+        "etp_rank": get_expert_tensor_parallel_rank(),
+    }
+
+    all_rank_info = [None] * world_size
+    torch.distributed.all_gather_object(all_rank_info, rank_info)
+
+    # Group ranks by ETP rank for fc1 (ColumnParallel: same output channels should match)
+    etp_groups = defaultdict(list)
+    for info in all_rank_info:
+        etp_groups[info["etp_rank"] if info["etp_rank"] else 0].append(info["global_rank"])
+
     for quantizer_type, rank_values in expert_quantizers.items():
-        if len(rank_values) > 1:  # Only check if we have multiple ranks
-            values = list(rank_values.values())
-            # Handle both scalar and tensor comparisons
-            first_val = values[0]
-            if isinstance(first_val, torch.Tensor):
-                # For tensors, check if all values are close to the first one
-                for val in values[1:]:
-                    if not torch.allclose(first_val, val, rtol=1e-6, atol=1e-6):
-                        return False, quantizer_type, rank_values
-            else:
-                # For scalars, use numeric comparison
-                max_diff = max(values) - min(values)
-                if max_diff > 1e-6:  # Allow for small floating point differences
-                    return False, quantizer_type, rank_values
+        # Determine which ranks should share the same amax:
+        #
+        # fc1 (ColumnParallel): X @ [A_1, A_2] (weights split along Cout),
+        #     so amax should match across ranks with the same ETP rank.
+        #     With EP=2 and ETP=2 there are 4 ranks (EP1/ETP1: 0, EP1/ETP2: 1, EP2/ETP1: 2, EP2/ETP2: 3),
+        #     so amax is compared within each ETP rank: [0, 2] and [1, 3].
+        #
+        # fc2 (RowParallel): [X_1, X_2] @ [A_1
+        #                                  A_2] (weights split along Cin),
+        #     so amax should match across all ranks.
+
+        rank_groups = (
+            list(etp_groups.values())
+            if "linear_fc1" in quantizer_type
+            else [list(range(world_size))]
+        )
+        # Check each group independently
+        for group in rank_groups:
+            group_values = [rank_values[r] for r in group if r in rank_values]
+            if len(group_values) > 1:
+                # All values in this group should be identical
+                first_val = group_values[0]
+                for val in group_values[1:]:
+                    if isinstance(first_val, torch.Tensor):
+                        if not torch.allclose(first_val, val, rtol=1e-6, atol=1e-6):
+                            group_rank_values = {
+                                r: rank_values[r] for r in group if r in rank_values
+                            }
+                            return False, f"{quantizer_type} (group {group})", group_rank_values
+                    elif abs(first_val - val) > 1e-6:
+                        group_rank_values = {r: rank_values[r] for r in group if r in rank_values}
+                        return False, f"{quantizer_type} (group {group})", group_rank_values

     return True, None, None
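
To make the grouping above concrete, here is a small standalone example (pure Python, no Megatron) for the EP=2, ETP=2 layout assumed in the comment, where global ranks 0–3 map to ETP ranks 0, 1, 0, 1. fc1 (ColumnParallel) amax is compared within each ETP rank, while fc2 (RowParallel) amax is compared across all ranks; the actual rank-to-group mapping depends on how parallel state is initialized.

from collections import defaultdict

world_size = 4
# Assumed layout for EP=2, ETP=2: global rank -> ETP rank (illustrative only).
etp_rank_of = {0: 0, 1: 1, 2: 0, 3: 1}

etp_groups = defaultdict(list)
for global_rank, etp_rank in etp_rank_of.items():
    etp_groups[etp_rank].append(global_rank)

fc1_groups = list(etp_groups.values())   # ColumnParallel: compare within the same ETP rank
fc2_groups = [list(range(world_size))]   # RowParallel: compare across all ranks

print(fc1_groups)  # [[0, 2], [1, 3]]
print(fc2_groups)  # [[0, 1, 2, 3]]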

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 21 additions & 10 deletions

@@ -45,6 +45,7 @@
 )
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.moe.experts import SequentialMLP, TEGroupedMLP
+from megatron.core.transformer.moe.router import TopKRouter

 import modelopt
 import modelopt.torch.opt as mto
@@ -240,6 +241,7 @@ def _gpt_model_provider(
     ep_size=1,
     etp_size=None,
     use_te=False,
+    transformer_impl="local",
 ):
     """Build the model."""

@@ -253,7 +255,7 @@ def _gpt_model_provider(
         ffn_hidden_size=None,
         num_attention_heads=8,
         activation_func="squared_relu",
-        transformer_impl="local",
+        transformer_impl=transformer_impl,
         hidden_size=hidden_size,
         vocab_size=vocab_size,
         use_cpu_initialization=meta_device,
@@ -270,7 +272,7 @@ def _gpt_model_provider(
         ffn_hidden_size=None,
         num_attention_heads=8,
         activation_func="squared_relu",
-        transformer_impl="local",
+        transformer_impl=transformer_impl,
         hidden_size=hidden_size,
         vocab_size=vocab_size,
         num_moe_experts=num_moe_experts,
@@ -297,6 +299,7 @@ def _test_sharded_state_dict(
     num_moe_experts = moe_config.get("num_moe_experts", None)
     moe_grouped_gemm = moe_config.get("moe_grouped_gemm", False)
     use_te = moe_config.get("use_te", False)
+    transformer_impl = moe_config.get("transformer_impl", "local")

     initialize_for_megatron(
         tensor_model_parallel_size=tp_size,
@@ -314,6 +317,7 @@ def _test_sharded_state_dict(
         use_te=use_te,
         ep_size=ep_size,
         etp_size=etp_size,
+        transformer_impl=transformer_impl,
     )
     model_test = _gpt_model_provider(
         tp_size,
@@ -325,6 +329,7 @@ def _test_sharded_state_dict(
         meta_device=meta_device,
         ep_size=ep_size,
         etp_size=etp_size,
+        transformer_impl=transformer_impl,
     )

     prompt_tokens = torch.randint(
@@ -531,10 +536,7 @@ def test_fp8_real_quantize():

 @pytest.mark.parametrize(
     "config",
-    [
-        mtq.FP8_DEFAULT_CFG,
-        mtq.NVFP4_DEFAULT_CFG,
-    ],
+    [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG, mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG],
 )
 @pytest.mark.parametrize("moe_grouped_gemm", [True, False])
 def test_moe_sharded_state_dict(need_4_gpus, tmp_path, config, moe_grouped_gemm):
@@ -549,6 +551,7 @@ def test_moe_sharded_state_dict(need_4_gpus, tmp_path, config, moe_grouped_gemm)
         "num_moe_experts": 4,
         "moe_grouped_gemm": moe_grouped_gemm,
         "use_te": moe_grouped_gemm,
+        "transformer_impl": "modelopt",
     }
     spawn_multiprocess_job(
         size=size,
@@ -606,6 +609,7 @@ def forward_fn(model):
         hidden_size=32,
         moe_grouped_gemm=False,
         num_moe_experts=4,
+        transformer_impl="modelopt",
     )
     num_sequential_mlp = sum(
         isinstance(module, SequentialMLP) for module in sequential_moe_model.modules()
@@ -666,10 +670,16 @@ def _test_expert_model_parallel_amax_sync(
         hidden_size=256,
         moe_grouped_gemm=moe_grouped_gemm,
         use_te=moe_grouped_gemm,
-        num_moe_experts=4,
+        num_moe_experts=8,
+        transformer_impl="modelopt",
     )
     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()

+    # force all expert routing
+    for module in model.modules():
+        if isinstance(module, TopKRouter):
+            module.topk = module.num_experts
+
     def forward_fn(model):
         return megatron_prefill(model, prompt_tokens)

@@ -701,9 +711,10 @@ def forward_fn(model):
     assert final_sync, f"Inconsistent amax for expert {quantizer_type} across ranks: {rank_values}"


+@pytest.mark.parametrize("config", [mtq.FP8_DEFAULT_CFG, mtq.INT8_DEFAULT_CFG])
 @pytest.mark.parametrize(("ep_size", "etp_size"), [(1, 2), (2, 1), (2, 2)])
 @pytest.mark.parametrize("moe_grouped_gemm", [True, False])
-def test_expert_parallel_sync(ep_size, etp_size, moe_grouped_gemm):
+def test_expert_parallel_sync(config, ep_size, etp_size, moe_grouped_gemm):
     """Test expert model parallel synchronization."""
     size = torch.cuda.device_count()
     if size < ep_size * etp_size:
@@ -716,11 +727,11 @@ def test_expert_parallel_sync(ep_size, etp_size, moe_grouped_gemm):
         size=size,
         job=partial(
             _test_expert_model_parallel_amax_sync,
-            2,
+            etp_size,  # tp_size
             ep_size,
             etp_size,
             moe_grouped_gemm,
-            mtq.FP8_DEFAULT_CFG,
+            config,
         ),
         backend="nccl",
     )
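
The "force all expert routing" patch above makes every expert receive tokens during calibration; with a small top-k, some experts may see no tokens at all, leaving their quantizers without a meaningful amax to compare across ranks. A toy illustration of that effect with plain top-k selection (not the Megatron TopKRouter):

import torch

torch.manual_seed(0)
num_experts, num_tokens = 8, 4
router_logits = torch.randn(num_tokens, num_experts)


def routed_experts(topk: int) -> set[int]:
    # Experts that receive at least one token under top-k routing.
    return set(router_logits.topk(topk, dim=-1).indices.flatten().tolist())


print(len(routed_experts(2)))            # usually < 8: some experts get no tokens
print(len(routed_experts(num_experts)))  # 8: every expert is exercised during calibration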
