Commit 6761109

consolidate tests
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 93bfd52 commit 6761109

3 files changed: 66 additions, 85 deletions

tests/_test_utils/torch_dist/plugins/megatron_common.py

Lines changed: 2 additions & 0 deletions
@@ -372,7 +372,9 @@ def run_mcore_inference(
     )

     # Note: This is returned in all TP ranks or last PP stage in PP models
+    print("inference_input size", inference_input["tokens"].shape)
     logits = wrapped_model.run_one_forward_step(inference_input)
+    print("logits size", logits.shape)
     logits = broadcast_from_last_pipeline_stage(
         [batch_size, model.max_sequence_length, model.vocab_size],
         dtype=torch.bfloat16 if model.config.bf16 else torch.float32,
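
The note in this hunk exists because in pipeline-parallel models only the last PP stage computes logits; broadcast_from_last_pipeline_stage then distributes them so every rank returns the same tensor. A minimal sketch of that pattern, for illustration only (an assumption, not the Megatron-Core implementation; the helper name and group handling here are hypothetical):

import torch
import torch.distributed as dist

def broadcast_from_last_stage_sketch(shape, dtype, local_logits=None, pp_group=None):
    # The last rank of the pipeline-parallel group owns the real logits.
    last_rank = dist.get_process_group_ranks(pp_group)[-1]
    if dist.get_rank() != last_rank:
        # Other ranks allocate an empty buffer of the agreed shape/dtype.
        local_logits = torch.empty(shape, dtype=dtype, device="cuda")
    # Every rank receives the last stage's values.
    dist.broadcast(local_logits, src=last_rank, group=pp_group)
    return local_logits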

tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 0 additions & 42 deletions
@@ -135,48 +135,6 @@ def _debug_awq_lite(model, forward_loop, alpha_step=0.1, debug=True, **kwargs):
     return original_awq_lite(model, forward_loop, alpha_step, debug=True, **kwargs)


-@patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
-def tensor_parallel_test_helper(model, config, tp_group, mock_awq_lite):
-    # The input to first layer, the column parallel should be the same across all tp ranks
-    calib_data = model.get_dummy_input().cuda()
-    dist.all_reduce(calib_data, op=dist.ReduceOp.AVG, group=tp_group)
-
-    def forward_loop(model):
-        model(calib_data)
-
-    model = mtq.quantize(model, config, forward_loop)
-    # Sanity check
-    forward_loop(model)
-
-    if config in [mtq.INT8_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG, mtq.INT8_SMOOTHQUANT_CFG]:
-        # Lets check the amax for row parallel input quantizer; it should be the same across all tp ranks
-        _distributed_attr_check(
-            model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, groups=[tp_group]
-        )
-        # Lets check the row parallel weight amax; it should be the same across all tp ranks
-        _distributed_attr_check(
-            model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, groups=[tp_group]
-        )
-
-    if config in [mtq.INT8_SMOOTHQUANT_CFG, mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        # Lets check the column parallel pre_quant_scale; it should be the same across all tp ranks
-        input_quantizer = model.fc1.input_quantizer
-        _distributed_attr_check(
-            input_quantizer, "pre_quant_scale", dist.ReduceOp.MAX, groups=[tp_group]
-        )
-
-    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        # Check activation scale for AWQ lite
-        _distributed_attr_check(
-            model.fc1.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            groups=[tp_group],
-        )
-
-    dist.destroy_process_group()
-
-
 @patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
 def data_tensor_context_parallel_test_helper(
     model, config, mock_awq_lite, dp_group=None, tp_group=None
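
The removed tensor_parallel_test_helper verified that calibrated quantizer attributes (amax, pre_quant_scale, act_scale) end up synchronized across the tensor-parallel group; that responsibility now lives in the remaining data_tensor_context_parallel_test_helper. A rough sketch of such a synchronization check, assuming a reduction-based approach (the real _distributed_attr_check in these test utils may differ):

import torch
import torch.distributed as dist

def distributed_attr_check_sketch(quantizer, attr, op, groups):
    """Assert that quantizer.<attr> already holds the same value on every rank in each group."""
    local = getattr(quantizer, attr).detach()
    for group in groups:
        reduced = local.clone()
        dist.all_reduce(reduced, op=op, group=group)  # e.g. ReduceOp.MAX or ReduceOp.AVG
        # If the attribute is synchronized, reducing it leaves it unchanged.
        assert torch.allclose(local, reduced), f"{attr} differs across ranks"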

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 64 additions & 43 deletions
@@ -92,13 +92,50 @@ def test_convert_megatron_parallel_linear(distributed_setup_size_1):
     destroy_model_parallel()


-# 1. Tensor Parallel Test
-def _test_tensor_parallel_helper(config, rank, size):
-    initialize_for_megatron(tensor_model_parallel_size=2, seed=SEED)
-    tp_group = get_tensor_model_parallel_group()
-    model = MegatronModel(tp_size=size, tp_group=tp_group).cuda()
+# Unified parallelism test helper
+def _test_parallelism_helper(
+    config,
+    rank,
+    size,
+    tensor_model_parallel_size=1,
+    context_parallel_size=1,
+    use_rank_in_seed=False,
+):
+    """
+    Unified helper for testing different parallelism configurations.
+    Args:
+        config: Quantization config to test
+        rank: Current rank in distributed setup
+        size: Total number of processes
+        tensor_model_parallel_size: Size of tensor model parallel group (default: 1)
+        context_parallel_size: Size of context parallel group (default: 1)
+        use_rank_in_seed: Whether to add rank to seed for different data across ranks (default: False)
+    """
+    seed = SEED + rank if use_rank_in_seed else SEED
+    initialize_for_megatron(
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        context_parallel_size=context_parallel_size,
+        seed=seed,
+    )

-    data_tensor_context_parallel_test_helper(model, config, tp_group=tp_group)
+    # Determine if we need tp_group and dp_group
+    tp_group = get_tensor_model_parallel_group() if tensor_model_parallel_size > 1 else None
+    dp_group = get_data_parallel_group(with_context_parallel=True)
+
+    # Create model with appropriate parallelism settings
+    model = MegatronModel(
+        tp_size=tensor_model_parallel_size,
+        cp_size=context_parallel_size,
+        tp_group=tp_group,
+    ).cuda()
+
+    # Call the test helper with appropriate groups
+    data_tensor_context_parallel_test_helper(
+        model,
+        config,
+        dp_group=dp_group,
+        tp_group=tp_group,
+    )


 @pytest.mark.parametrize(
@@ -115,18 +152,12 @@ def _test_tensor_parallel_helper(config, rank, size):
 )
 def test_tensor_parallel(need_2_gpus, config):
     spawn_multiprocess_job(
-        size=2, job=partial(_test_tensor_parallel_helper, config), backend="nccl"
+        size=2,
+        job=partial(_test_parallelism_helper, config, tensor_model_parallel_size=2),
+        backend="nccl",
     )


-# 2. Data Parallel Test
-def _test_data_parallel_helper(config, rank, size):
-    initialize_for_megatron(seed=SEED + rank)  # modify seed so data is different across ranks
-    model = MegatronModel().cuda()
-
-    data_tensor_context_parallel_test_helper(model, config, dp_group=get_data_parallel_group())
-
-
 @pytest.mark.parametrize(
     "config",
     [
@@ -140,18 +171,10 @@ def _test_data_parallel_helper(config, rank, size):
     ],
 )
 def test_data_parallel(need_2_gpus, config):
-    spawn_multiprocess_job(size=2, job=partial(_test_data_parallel_helper, config), backend="nccl")
-
-
-# 3. Context Parallel Test
-def _test_context_parallel_helper(config, rank, size):
-    initialize_for_megatron(
-        context_parallel_size=size, seed=SEED + rank
-    )  # modify seed so data is different across ranks
-    model = MegatronModel(cp_size=size).cuda()
-
-    data_tensor_context_parallel_test_helper(
-        model, config, dp_group=get_data_parallel_group(with_context_parallel=True)
+    spawn_multiprocess_job(
+        size=2,
+        job=partial(_test_parallelism_helper, config, use_rank_in_seed=True),
+        backend="nccl",
     )


@@ -169,21 +192,11 @@ def _test_context_parallel_helper(config, rank, size):
 )
 def test_context_parallel(need_2_gpus, config):
     spawn_multiprocess_job(
-        size=2, job=partial(_test_context_parallel_helper, config), backend="nccl"
-    )
-
-
-# 4. DP=2 + TP=2 + CP=2 Test (on 2*2*2=8 GPUs)
-def _test_data_tensor_context_parallel_helper(config, rank, size):
-    initialize_for_megatron(tensor_model_parallel_size=2, context_parallel_size=2, seed=SEED + rank)
-    tp_group = get_tensor_model_parallel_group()
-    model = MegatronModel(tp_size=2, cp_size=2, tp_group=tp_group).cuda()
-
-    data_tensor_context_parallel_test_helper(
-        model,
-        config,
-        dp_group=get_data_parallel_group(with_context_parallel=True),
-        tp_group=tp_group,
+        size=2,
+        job=partial(
+            _test_parallelism_helper, config, context_parallel_size=2, use_rank_in_seed=True
+        ),
+        backend="nccl",
     )


@@ -201,7 +214,15 @@ def _test_data_tensor_context_parallel_helper(config, rank, size):
 )
 def test_data_tensor_context_parallel(need_8_gpus, config):
     spawn_multiprocess_job(
-        size=8, job=partial(_test_data_tensor_context_parallel_helper, config), backend="nccl"
+        size=8,
+        job=partial(
+            _test_parallelism_helper,
+            config,
+            tensor_model_parallel_size=2,
+            context_parallel_size=2,
+            use_rank_in_seed=True,
+        ),
+        backend="nccl",
     )

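
Because the helpers keep the signature (config, rank, size, ...) and the tests bind config plus keyword overrides via functools.partial, spawn_multiprocess_job presumably only needs to call job(rank, size) in each spawned process. A small self-contained illustration of that argument threading (the helper body below is a stand-in, not the test code):

from functools import partial

def _helper(config, rank, size, tensor_model_parallel_size=1, context_parallel_size=1,
            use_rank_in_seed=False):
    # Stand-in body: just show which arguments each spawned process receives.
    print(config, rank, size, tensor_model_parallel_size, context_parallel_size, use_rank_in_seed)

job = partial(_helper, "SOME_CFG", tensor_model_parallel_size=2)
job(0, 2)  # per-process call -> _helper("SOME_CFG", 0, 2, tensor_model_parallel_size=2)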
