Commit 17df5ca

Updated moe names in tests

Signed-off-by: Kinjal Patel <[email protected]>
1 parent: 1c821d8

3 files changed (+41, -44 lines)

modelopt/torch/quantization/mode.py
Lines changed: 1 addition & 4 deletions

@@ -293,10 +293,7 @@ def convert(self) -> ConvertEntrypoint:
         def wrapped_func(model, config, forward_loop=None):
             # Access _calib_func as a class attribute to avoid binding
             # Check if _calib_func is defined as a class attribute
-            calib_results = wrapped_calib_func(
-                model, config, forward_loop, func=self.__class__._calib_func
-            )
-            return calib_results
+            return wrapped_calib_func(model, config, forward_loop, func=self.__class__._calib_func)
 
         return wrapped_func
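Note on the refactor: the preserved comment says _calib_func must be looked up on the class to avoid binding. For readers unfamiliar with the pitfall, here is a minimal, self-contained sketch (names are illustrative, not taken from mode.py) of why instance lookup of a function stored as a class attribute would inject self into the first argument slot:

def calibrate(model, config):
    """Stand-in for a calibration routine stored on a descriptor class."""
    return f"calibrated {model} with {config}"


class QuantDescriptor:
    # A plain function assigned as a class attribute.
    _calib_func = calibrate

    def make_wrapped(self):
        # self._calib_func would be a bound method, so `self` would be passed
        # as the first positional argument (the `model` slot). Looking the
        # attribute up on the class returns the plain function instead.
        func = self.__class__._calib_func
        return lambda model, config: func(model, config)


wrapped = QuantDescriptor().make_wrapped()
print(wrapped("toy-model", "FP8"))  # -> calibrated toy-model with FP8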

tests/_test_utils/torch_dist/plugins/megatron_common.py
Lines changed: 18 additions & 18 deletions

@@ -497,15 +497,15 @@ def convert_maybe_fp8(v):
     )
 
 
-def copy_weights_from_grouped_to_non_grouped(grouped_model, non_grouped_model):
-    """Copy weights from grouped MoE model to non-grouped MoE model."""
-    grouped_state = grouped_model.state_dict()
-    non_grouped_state = non_grouped_model.state_dict()
+def copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_moe_model):
+    """Copy weights from TEGrouped MoE model to sequential MoE model."""
+    te_grouped_state = te_grouped_moe_model.state_dict()
+    sequential_state = sequential_moe_model.state_dict()
 
-    # Map grouped weights to non-grouped weights
+    # Map grouped weights to sequential weights
     weight_mapping = {}
-    non_grouped_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"
-    for key, value in grouped_state.items():
+    sequential_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"
+    for key, value in te_grouped_state.items():
         if "experts.linear_fc" in key and "weight" in key:
             # Extract expert index from grouped weight name
             # Format: decoder.layers.X.mlp.experts.linear_fcY.weightZ
@@ -514,19 +514,19 @@ def copy_weights_from_grouped_to_non_grouped(grouped_model, non_grouped_model):
             fc_idx = parts[5]  # Y (linear_fc1 or linear_fc2)
             weight_idx = parts[6]  # Z (weight0, weight1, etc.)
 
-            # Map to non-grouped format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ.weight
+            # Map to sequential format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ.weight
             expert_idx = weight_idx.replace("weight", "")
-            non_grouped_key = non_grouped_key_template.format(layer_idx, expert_idx, fc_idx[-1])
-            weight_mapping[non_grouped_key] = value
+            sequential_key = sequential_key_template.format(layer_idx, expert_idx, fc_idx[-1])
+            weight_mapping[sequential_key] = value
         elif isinstance(value, torch.Tensor):
             weight_mapping[key] = value
 
-    # Copy weights to non-grouped model
-    for non_grouped_key in non_grouped_state:
-        if non_grouped_key in weight_mapping:
-            non_grouped_state[non_grouped_key] = weight_mapping[non_grouped_key].clone()
+    # Copy weights to sequential model
+    for sequential_key in sequential_state:
+        if sequential_key in weight_mapping:
+            sequential_state[sequential_key] = weight_mapping[sequential_key].clone()
 
-    non_grouped_model.load_state_dict(non_grouped_state)
+    sequential_moe_model.load_state_dict(sequential_state)
 
 
 def compare_amax_sync_across_expert_parallel(model):
@@ -549,7 +549,7 @@ def compare_amax_sync_across_expert_parallel(model):
     expert_amax_values = {}
     for name, module in model.named_modules():
         if isinstance(module, mtq.nn.TensorQuantizer) and hasattr(module, "_amax"):
-            # Check for both grouped and non-grouped MoE patterns
+            # Check for both TEGrouped and sequential MoE patterns
             if "local_experts" in name or ("experts" in name and "linear_fc" in name):
                 expert_amax_values[name] = (
                     module.amax.item() if hasattr(module.amax, "item") else module.amax
@@ -569,10 +569,10 @@ def compare_amax_sync_across_expert_parallel(model):
         for name, amax_val in rank_amax.items():
             # Create quantizer type key by normalizing the name
             if "local_experts" in name:
-                # Non-grouped MoE: replace expert index with wildcard
+                # sequential MoE: replace expert index with wildcard
                 quantizer_type = re.sub(r"local_experts\.\d+", "local_experts.*", name)
             else:
-                # Grouped MoE: use the name as-is since experts are grouped
+                # TEGrouped MoE: use the name as-is since experts are grouped
                 quantizer_type = name
 
             if quantizer_type not in expert_quantizers:
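For reference, the name translation performed by copy_weights_from_grouped_to_non_grouped can be exercised on its own. The sketch below is a hypothetical standalone helper (no Megatron required); the layer-index extraction is inferred from the format comment, since that line falls outside the shown hunks. It also demonstrates the wildcard normalization used in compare_amax_sync_across_expert_parallel on an illustrative quantizer name:

import re

# TEGrouped layout:  decoder.layers.X.mlp.experts.linear_fcY.weightZ
# Sequential layout: decoder.layers.X.mlp.experts.local_experts.Z.linear_fcY.weight
SEQUENTIAL_KEY_TEMPLATE = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"


def te_grouped_to_sequential_key(key: str) -> str:
    parts = key.split(".")
    layer_idx = parts[2]   # X (assumed; the extraction line is not shown in the diff)
    fc_idx = parts[5]      # "linear_fc1" or "linear_fc2"
    weight_idx = parts[6]  # "weight0", "weight1", ...
    expert_idx = weight_idx.replace("weight", "")
    return SEQUENTIAL_KEY_TEMPLATE.format(layer_idx, expert_idx, fc_idx[-1])


assert (
    te_grouped_to_sequential_key("decoder.layers.0.mlp.experts.linear_fc1.weight2")
    == "decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.weight"
)

# Wildcard normalization for a sequential-MoE quantizer name (name is illustrative):
name = "decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.weight_quantizer"
assert re.sub(r"local_experts\.\d+", "local_experts.*", name) == (
    "decoder.layers.0.mlp.experts.local_experts.*.linear_fc1.weight_quantizer"
)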

tests/gpu/torch/quantization/plugins/test_megatron.py
Lines changed: 22 additions & 22 deletions

@@ -544,8 +544,8 @@ def test_moe_sharded_state_dict(need_8_gpus, tmp_path, config):
     )
 
 
-def _test_grouped_vs_non_grouped_quantize_helper(tp_size, ep_size, etp_size, rank, size):
-    """Test that grouped and non-grouped MoE models produce similar amax values."""
+def _test_te_grouped_vs_sequential_quantize_helper(tp_size, ep_size, etp_size, rank, size):
+    """Test that TEGrouped and sequential MoE models produce similar amax values."""
     initialize_for_megatron(
         tensor_model_parallel_size=tp_size,
         expert_model_parallel_size=ep_size,
@@ -559,8 +559,8 @@ def _test_grouped_vs_non_grouped_quantize_helper(tp_size, ep_size, etp_size, ran
     def forward_fn(model):
         return megatron_prefill(model, prompt_tokens)
 
-    # Create grouped MoE model
-    grouped_moe_model = _gpt_model_provider(
+    # Create TEGrouped MoE model
+    te_grouped_moe_model = _gpt_model_provider(
         tp_size=tp_size,
         ep_size=ep_size,
         etp_size=etp_size,
@@ -569,14 +569,14 @@ def forward_fn(model):
         use_te=True,
         num_moe_experts=4,
     )
-    num_grouped_mlp = sum(
-        isinstance(module, TEGroupedMLP) for module in grouped_moe_model.modules()
+    num_te_grouped_mlp = sum(
+        isinstance(module, TEGroupedMLP) for module in te_grouped_moe_model.modules()
     )
-    assert num_grouped_mlp == 4, (
-        f"TEGrupedMoEModel has {num_grouped_mlp} TEGroupedMLP modules, it should have 4"
+    assert num_te_grouped_mlp == 4, (
+        f"TEGrupedMoEModel has {num_te_grouped_mlp} TEGroupedMLP modules, it should have 4"
     )
 
-    # Create non-grouped MoE model
+    # Create sequential MoE model
     sequential_moe_model = _gpt_model_provider(
         tp_size=tp_size,
         ep_size=ep_size,
@@ -592,37 +592,37 @@ def forward_fn(model):
         f"SequentialMoEModel has {num_sequential_mlp} SequentialMLP modules, it should have 4"
     )
     # Copy weights from grouped to non-grouped model
-    copy_weights_from_grouped_to_non_grouped(grouped_moe_model, sequential_moe_model)
+    copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_moe_model)
 
     # Compare model outputs before quantization
-    grouped_moe_output = forward_fn(grouped_moe_model)
-    non_grouped_moe_output = forward_fn(sequential_moe_model)
-    assert torch.allclose(grouped_moe_output, non_grouped_moe_output, atol=1e-6, rtol=1e-6)
+    te_grouped_moe_output = forward_fn(te_grouped_moe_model)
+    sequential_moe_output = forward_fn(sequential_moe_model)
+    assert torch.allclose(te_grouped_moe_output, sequential_moe_output, atol=1e-6, rtol=1e-6)
 
     # Quantize grouped model
-    mtq.quantize(grouped_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
+    mtq.quantize(te_grouped_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
 
     # Quantize non-grouped model
     mtq.quantize(sequential_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
 
     # Compare model outputs after quantization
-    grouped_moe_quant_output = forward_fn(grouped_moe_model)
-    non_grouped_moe_quant_output = forward_fn(sequential_moe_model)
+    te_grouped_moe_quant_output = forward_fn(te_grouped_moe_model)
+    sequential_moe_quant_output = forward_fn(sequential_moe_model)
     assert torch.allclose(
-        grouped_moe_quant_output, non_grouped_moe_quant_output, atol=1e-6, rtol=1e-6
+        te_grouped_moe_quant_output, sequential_moe_quant_output, atol=1e-6, rtol=1e-6
     )
 
 
-def test_grouped_vs_non_grouped_quantize():
-    """Test that grouped and non-grouped MoE models produce similar quantized models."""
+def test_te_grouped_vs_sequential_quantize():
+    """Test that TEGrouped and sequential MoE models produce similar quantized models."""
 
     size = torch.cuda.device_count()
     if size < 4:
         pytest.skip("Requires at least 4 GPUs for expert parallel test")
 
     spawn_multiprocess_job(
         size=size,
-        job=partial(_test_grouped_vs_non_grouped_quantize_helper, 1, 2, 2),
+        job=partial(_test_te_grouped_vs_sequential_quantize_helper, 1, 2, 2),
         backend="nccl",
     )
 
@@ -666,8 +666,8 @@ def forward_fn(model):
         if isinstance(module, mtq.nn.TensorQuantizer):
             # Check if this is an expert quantizer
             is_expert_quantizer = (
-                "local_experts" in name  # Non-grouped MoE
-                or ("experts" in name and "linear_fc" in name)  # Grouped MoE
+                "local_experts" in name  # sequential MoE
+                or ("experts" in name and "linear_fc" in name)  # TEGrouped MoE
            )
 
             if is_expert_quantizer and hasattr(module, "_amax"):