
Commit f16ee9b

address comments from Yuxian
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent e9af2f3 commit f16ee9b

5 files changed (+146, -7 lines)

tensorrt_llm/_torch/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -1506,7 +1506,7 @@ def _broadcast_cache_data(
         """Broadcast tactics from root rank to all other ranks."""
         cache_data = self.profiling_cache.get_specific_custom_op(custom_op)
         root = 0
-        cache_data = self._dist.tp_broadcast(obj=cache_data, root=root)
+        cache_data = self._dist.tp_cp_broadcast(obj=cache_data, root=root)
 
         self.profiling_cache.merge_cache_data(cache_data)
 

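Note on this call site: with helix parallelism enabled, tp_broadcast alone reaches only the root's TP peers, leaving CP ranks to profile tactics independently; switching to tp_cp_broadcast (added in communicator.py below) extends the same broadcast to the full TP×CP group.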
tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 25 additions & 0 deletions
@@ -411,6 +411,18 @@ def cp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
         comm = self.cp_comm
         return safe_broadcast(comm, obj, root=root, chunk_size=chunk_size)
 
+    def tp_cp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
+        """Broadcast object across both TP and CP groups.
+
+        This is used when both TP and CP parallelism are enabled (e.g., helix parallelism).
+        First broadcasts within the TP group, then within the CP group.
+        """
+        if self.tp_size > 1:
+            obj = self.tp_broadcast(obj, root=root, chunk_size=chunk_size)
+        if self.cp_size > 1:
+            obj = self.cp_broadcast(obj, root=root, chunk_size=chunk_size)
+        return obj
+
     def tp_allgather(self, obj):
         return self.tp_comm.allgather(obj)
 
@@ -730,6 +742,19 @@ def cp_broadcast(self, obj, root=0):
                                  device=torch.device("cpu"))
         return ret[0]
 
+    @log_op
+    def tp_cp_broadcast(self, obj, root=0):
+        """Broadcast object across both TP and CP groups.
+
+        This is used when both TP and CP parallelism are enabled (e.g., helix parallelism).
+        First broadcasts within the TP group, then within the CP group.
+        """
+        if self.tp_size > 1:
+            obj = self.tp_broadcast(obj, root=root)
+        if self.cp_size > 1:
+            obj = self.cp_broadcast(obj, root=root)
+        return obj
+
     @log_op
     def pp_allgather(self, obj):
         if isinstance(obj, torch.Tensor):

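Both the MPI-backed and torch.distributed-backed variants share the same two-stage shape: broadcast within the TP group first, then within the CP group, so the payload fans out from the root to every rank of the TP×CP grid. Below is a minimal plain-Python sketch of that fan-out (no MPI or torch.distributed involved; the row-major rank layout is an illustrative assumption, not necessarily how Mapping orders ranks):

# Minimal sketch of the two-stage fan-out, assuming a row-major rank grid
# (rank = cp_index * tp_size + tp_index); the real Mapping may differ.

def tp_cp_broadcast_sim(values, tp_size, cp_size, root=0):
    """values maps rank -> payload; only the global root starts non-None."""
    # Stage 1: every TP group broadcasts from its tp_rank == root member.
    # Only the root's own TP group actually propagates the payload here.
    for cp in range(cp_size):
        group = [cp * tp_size + tp for tp in range(tp_size)]
        src = group[root]
        for r in group:
            values[r] = values[src]
    # Stage 2: every CP group broadcasts from its cp_rank == root member,
    # pushing the payload down each "column" of the grid.
    for tp in range(tp_size):
        group = [cp * tp_size + tp for cp in range(cp_size)]
        src = group[root]
        for r in group:
            values[r] = values[src]
    return values

vals = {rank: None for rank in range(4)}
vals[0] = {"tactic": "best"}  # only the global root holds the object
out = tp_cp_broadcast_sim(vals, tp_size=2, cp_size=2)
assert all(v == {"tactic": "best"} for v in out.values())

After stage 1 the payload lives on the root's entire TP group; stage 2 then pushes it down each CP group, which is also why either stage can be skipped independently whenever the corresponding group has size 1.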
tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 1 addition & 6 deletions
@@ -590,12 +590,7 @@ def _broadcast_new_requests(
         # Broadcast within first PP stage before send/recv chain to other PP stages.
         # This needs to cover both TP and CP ranks within the first PP stage.
         if self.dist.is_first_pp_rank:
-            if self.dist.tp_size > 1:
-                payloads = self.dist.tp_broadcast(payloads, root=0)
-            # Also broadcast within CP group when CP is enabled (helix parallelism).
-            # This ensures all CP ranks within the first PP stage receive the requests.
-            if self.dist.cp_size > 1:
-                payloads = self.dist.cp_broadcast(payloads, root=0)
+            payloads = self.dist.tp_cp_broadcast(payloads, root=0)
 
         # Tag for communication
         tag = self.dist.pp_size  # Use pp_size as tag to avoid conflicts

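The replacement is behavior-preserving: tp_cp_broadcast applies the same tp_size > 1 and cp_size > 1 guards internally, so plain-TP runs still perform a single TP broadcast while helix (TP+CP) runs get both stages, with the previously open-coded sequence now centralized in the communicator.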
tests/unittest/_torch/distributed/test_cp_broadcast.py

Lines changed: 55 additions & 0 deletions
@@ -208,3 +208,58 @@ def test_mpi_cp_broadcast_integration():
 if __name__ == "__main__":
     # Allow running directly with mpirun
     pytest.main([__file__, "-v"])
+
+
+class TestMPIDistTpCpBroadcast:
+    """Tests for MPIDist.tp_cp_broadcast functionality."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        """Set up MPI environment and mapping for each test."""
+        skip_if_not_mpi()
+        self.rank, self.world_size = get_mpi_info()
+
+        # Set up mapping with both TP and CP enabled
+        # For 2 ranks: tp_size=1, cp_size=2 (tp_cp_broadcast will only do cp_broadcast)
+        self.mapping = Mapping(
+            world_size=self.world_size,
+            rank=self.rank,
+            tp_size=1,
+            cp_size=self.world_size,
+            pp_size=1,
+        )
+        self.dist = MPIDist(mapping=self.mapping)
+
+    def test_tp_cp_broadcast_python_dict(self):
+        """Test broadcasting a Python dictionary via tp_cp_broadcast."""
+        root = 0
+
+        # Only rank 0 in both TP and CP groups should have the object
+        if self.mapping.tp_rank == root and self.mapping.cp_rank == root:
+            obj = {
+                "model_name": "llama",
+                "batch_size": 32,
+                "tokens": [1, 2, 3, 4, 5],
+            }
+        else:
+            obj = None
+
+        result = self.dist.tp_cp_broadcast(obj, root=root)
+
+        # Verify all ranks received the correct object
+        assert result["model_name"] == "llama"
+        assert result["batch_size"] == 32
+        assert result["tokens"] == [1, 2, 3, 4, 5]
+
+    def test_tp_cp_broadcast_python_list(self):
+        """Test broadcasting a Python list via tp_cp_broadcast."""
+        root = 0
+
+        if self.mapping.tp_rank == root and self.mapping.cp_rank == root:
+            obj = ["request1", "request2", {"id": 123, "data": [1, 2, 3]}]
+        else:
+            obj = None
+
+        result = self.dist.tp_cp_broadcast(obj, root=root)
+
+        assert result == ["request1", "request2", {"id": 123, "data": [1, 2, 3]}]

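Since the fixture pins tp_size=1 and cp_size=world_size, these tests exercise only the CP leg of tp_cp_broadcast; the TP-then-CP chaining itself is reached only through the helper's internal size guards. As the __main__ hook notes, the module can also be launched under MPI directly, e.g. mpirun -n 2 python tests/unittest/_torch/distributed/test_cp_broadcast.py (exact launcher flags depend on the local MPI setup).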
tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py

Lines changed: 64 additions & 0 deletions
@@ -337,6 +337,21 @@ def run_object_broadcast(self, root_obj, root: int = 0):
         # After broadcast, all CP ranks should have the same object
         return result == root_obj
 
+    def run_tp_cp_broadcast(self, root_obj, root: int = 0):
+        """Test broadcasting an object via tp_cp_broadcast."""
+        # For tp_cp_broadcast, only rank 0 in both TP and CP should have the object
+        tp_rank = self.mapping.tp_rank
+        cp_rank = self.mapping.cp_rank
+        if tp_rank == root and cp_rank == root:
+            obj = root_obj
+        else:
+            obj = None
+
+        result = self.dist.tp_cp_broadcast(obj, root=root)
+
+        # After broadcast, all TP and CP ranks should have the same object
+        return result == root_obj
+
 
 @pytest.mark.gpu2
 @pytest.mark.parametrize("hidden_size", [128, 512], ids=lambda x: f"hidden:{x}")
 
@@ -422,3 +437,52 @@ def test_cp_broadcast_object(setup_ray_cluster, test_object):
     ])
     for r in results:
         assert r is True, f"Object broadcast from root=0 failed for {type(test_object)}"
+
+
+@pytest.mark.gpu2
+@pytest.mark.parametrize("test_object", [
+    {
+        "key1": "value1",
+        "key2": [1, 2, 3]
+    },
+    ["item1", "item2", {
+        "nested": True
+    }],
+    "simple_string",
+],
+                         ids=["dict", "list", "string"])
+def test_tp_cp_broadcast(setup_ray_cluster, test_object):
+    """Test TorchDist.tp_cp_broadcast with various objects.
+
+    This tests the combined TP+CP broadcast which is used when both tensor
+    and context parallelism are enabled (e.g., helix parallelism).
+    """
+    world_size = 2
+    tp_size = 1
+    cp_size = 2  # Enable context parallelism (tp_cp_broadcast will only do cp_broadcast)
+
+    runtime_env = ray.runtime_env.RuntimeEnv()
+    runtime_env["env_vars"] = os.environ.copy()
+    runtime_env["env_vars"].update({
+        "TLLM_DISABLE_MPI": "1",
+        "MASTER_ADDR": "127.0.0.1",
+    })
+
+    remote_tests = []
+    for rank in range(world_size):
+        remote_tests.append(
+            CpBroadcastTest.options(runtime_env=runtime_env).remote(
+                rank, world_size, tp_size, cp_size))
+
+    ray.get([test.__ray_ready__.remote() for test in remote_tests])
+
+    port = ray.get(remote_tests[0].setup_tcp_store.remote())
+    ray.get([test.setup_distributed_env.remote(port) for test in remote_tests])
+
+    # Test tp_cp_broadcast from root=0
+    results = ray.get([
+        test.run_tp_cp_broadcast.remote(test_object, root=0)
+        for test in remote_tests
+    ])
+    for r in results:
+        assert r is True, f"tp_cp_broadcast from root=0 failed for {type(test_object)}"

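The Ray-orchestrated test mirrors the MPI one: TLLM_DISABLE_MPI=1 keeps the actors on the TorchDist path, and with tp_size=1, cp_size=2 on two GPUs only the CP leg of the helper runs. Covering the chained TP-then-CP case would take a larger grid (e.g., tp_size=2, cp_size=2 on four GPUs).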