@@ -119,7 +119,7 @@ def save_restore_test(model_cls, device, quant_config, compress=False, version=N
119119    mto.restore_from_modelopt_state(model_ref, state_dict)
120120
121121
122- def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX, group=None):
122+ def _reduce_quantizer_attr(quantizer, attr: str, op=dist.ReduceOp.MAX, group=None):
123123     quantizer_attr = getattr(quantizer, attr).clone()
124124     print("quantizer.attr before reduce", getattr(quantizer, attr))
125125     dist.all_reduce(quantizer_attr, op=op, group=group)
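For reference, the check these helpers build toward follows one pattern: clone a quantizer attribute, all-reduce the clone with MAX over a process group, and assert the local value already equals the reduced one (i.e. calibration synchronized it across the group). A minimal sketch of that pattern, assuming an already-initialized default process group and an illustrative amax tensor rather than the real quantizer fixture:

import torch
import torch.distributed as dist

def assert_synced(amax: torch.Tensor, group=None, op=dist.ReduceOp.MAX):
    # Clone so the reduction does not overwrite the value under test.
    reduced = amax.clone()
    # If calibration already propagated the group-wide max, the reduce is a no-op.
    dist.all_reduce(reduced, op=op, group=group)
    assert torch.allclose(reduced, amax), "amax is not synchronized across the group"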
@@ -225,9 +225,46 @@ def forward_loop(model):
225225
226226
227227@patch ("modelopt.torch.quantization.model_calib.awq_lite" , side_effect = _debug_awq_lite )
228- def data_tensor_context_parallel_test_helper (
229- model , config , dp_group , tp_group , cp_group , mock_awq_lite
230- ):
228+ def data_tensor_context_parallel_test_helper (model , config , dp_group , tp_group , mock_awq_lite ):
229+ # Print rank information for debugging
230+ world_rank = dist .get_rank ()
231+ world_size = dist .get_world_size ()
232+
233+ print ("\n === RANK INFORMATION ===" )
234+ print (f"World Rank: { world_rank } , World Size: { world_size } " )
235+
236+ # Get group information with actual ranks
237+     def get_group_ranks(group):
238+         if group is None:
239+             return None
240+         # Map each rank of the group back to its global rank
241+         ranks = [
242+             dist.get_global_rank(group, r) for r in range(dist.get_world_size(group=group))
243+         ]
244+         return ranks
245+
246+     if dp_group is not None:
247+         dp_rank = dist.get_rank(group=dp_group)
248+         dp_size = dist.get_world_size(group=dp_group)
249+         print(f"DP Group - Rank: {dp_rank}, Size: {dp_size}")
250+
251+     if tp_group is not None:
252+         tp_rank = dist.get_rank(group=tp_group)
253+         tp_size = dist.get_world_size(group=tp_group)
254+         print(f"TP Group - Rank: {tp_rank}, Size: {tp_size}")
255+
256+     print("=== END RANK INFO ===\n")
257+
258+     # Print a summary of all ranks
259+     print("=== ALL RANKS SUMMARY ===")
260+     print(f"Total GPUs: {world_size}")
261+     print(f"Current rank: {world_rank}")
262+     if dp_group is not None:
263+         print(f"DP groups: {world_size // dp_size} groups of {dp_size} ranks each")
264+     if tp_group is not None:
265+         print(f"TP groups: {world_size // tp_size} groups of {tp_size} ranks each")
266+ print ("=== END SUMMARY ===\n " )
267+
231268     calib_data = model.get_dummy_input().cuda()
232269     # data should be same across each TP rank
233270     dist.all_reduce(calib_data, op=dist.ReduceOp.AVG, group=tp_group)
@@ -238,14 +275,38 @@ def forward_loop(model):
238275 model = mtq .quantize (model , config , forward_loop )
239276
240277 def _reduce_quantizer_attr (quantizer , attr = str , op = dist .ReduceOp .MAX ):
278+         world_rank = dist.get_rank()
279+         print(f"\n--- Rank {world_rank}: Reducing {attr} ---")
280+         from megatron.core.parallel_state import (
281+             _CONTEXT_PARALLEL_GLOBAL_RANKS,
282+             _DATA_PARALLEL_GLOBAL_RANKS,
283+             _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP,
284+             _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS,
285+         )
286+
287+         print(f"DATA_PARALLEL_GLOBAL_RANKS: {_DATA_PARALLEL_GLOBAL_RANKS}")
288+         print(f"CONTEXT_PARALLEL_GLOBAL_RANKS: {_CONTEXT_PARALLEL_GLOBAL_RANKS}")
289+         print(f"DATA_PARALLEL_GLOBAL_RANKS_WITH_CP: {_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP}")
290+         print(f"TENSOR_MODEL_PARALLEL_GLOBAL_RANKS: {_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS}")
241291         quantizer_attr = getattr(quantizer, attr).clone()
242- print ("quantizer_attr before reduce" , quantizer_attr )
243- print ("quantizer.attr before reduce" , getattr (quantizer , attr ))
244- dist .all_reduce (quantizer_attr , op = op , group = dp_group )
245- dist .all_reduce (quantizer_attr , op = op , group = cp_group )
246- dist .all_reduce (quantizer_attr , op = op , group = tp_group )
247- print ("quantizer_attr after reduce" , quantizer_attr )
248- print ("quantizer.attr after reduce" , getattr (quantizer , attr ))
292+ print (f"Rank { world_rank } - quantizer_attr before reduce" , quantizer_attr )
293+ print (f"Rank { world_rank } - quantizer.attr before reduce" , getattr (quantizer , attr ))
294+
295+         # Perform all-reduce operations
296+         if tp_group is not None:
297+             tp_rank = dist.get_rank(group=tp_group)
298+             print(f"Rank {world_rank} - TP reduce (TP rank {tp_rank})")
299+             dist.all_reduce(quantizer_attr, op=op, group=tp_group)
300+
301+         if dp_group is not None:
302+             dp_rank = dist.get_rank(group=dp_group)
303+             print(f"Rank {world_rank} - DP reduce (DP rank {dp_rank})")
304+             dist.all_reduce(quantizer_attr, op=op, group=dp_group)
305+
306+         print(f"Rank {world_rank} - quantizer_attr after reduce", quantizer_attr)
307+         print(f"Rank {world_rank} - quantizer.attr after reduce", getattr(quantizer, attr))
308+         print(f"--- End Rank {world_rank} ---\n")
309+
249310         assert torch.allclose(quantizer_attr, getattr(quantizer, attr))
250311
251312     # Input quantizer amax
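For readers running a helper like this outside the Megatron test harness: dp_group and tp_group are ordinary torch.distributed process groups. Below is a rough sketch of how a 4-GPU world could be partitioned into 2-way TP and 2-way DP groups with dist.new_group; the contiguous-TP / strided-DP rank layout is an assumption for illustration, while the actual tests take their groups from megatron.core.parallel_state:

import torch.distributed as dist

def build_tp_dp_groups(world_size: int = 4, tp_size: int = 2):
    rank = dist.get_rank()
    tp_group, dp_group = None, None
    # TP groups: contiguous ranks, e.g. [0, 1] and [2, 3].
    for start in range(0, world_size, tp_size):
        ranks = list(range(start, start + tp_size))
        group = dist.new_group(ranks=ranks)  # collective: every rank must call this
        if rank in ranks:
            tp_group = group
    # DP groups: strided ranks, e.g. [0, 2] and [1, 3].
    for offset in range(tp_size):
        ranks = list(range(offset, world_size, tp_size))
        group = dist.new_group(ranks=ranks)
        if rank in ranks:
            dp_group = group
    return tp_group, dp_group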