Commit 658663e

add unit tests
1 parent db62a2b commit 658663e

File tree

2 files changed (+372 -0 lines):
  tests/unittest/_torch/distributed/test_cp_broadcast.py  (+213)
  tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py  (+159)

tests/unittest/_torch/distributed/test_cp_broadcast.py

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
"""
Tests for cp_broadcast functionality in both MPIDist and TorchDist.

This module tests the context parallelism broadcast operation which is used
when CP (context parallelism) is enabled (e.g., in Helix parallelism).

For MPIDist tests, run with mpirun:
    mpirun -n 2 python -m pytest tests/unittest/_torch/distributed/test_cp_broadcast.py -v

For TorchDist tests, see test_ops.py which uses Ray for distributed testing.
"""

import numpy as np
import pytest
import torch

from tensorrt_llm._torch.distributed import MPIDist
from tensorrt_llm.mapping import Mapping


def get_mpi_info():
    """Get MPI rank and world size; returns (0, 1) if MPI is not available."""
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        return comm.Get_rank(), comm.Get_size()
    except ImportError:
        return 0, 1


def skip_if_not_mpi():
    """Skip the test if not running under MPI with sufficient ranks."""
    rank, world_size = get_mpi_info()
    if world_size < 2:
        pytest.skip("Test requires at least 2 MPI ranks (run with mpirun -n 2)")


class TestMPIDistCpBroadcast:
    """Tests for MPIDist.cp_broadcast functionality."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Set up MPI environment and mapping for each test."""
        skip_if_not_mpi()
        self.rank, self.world_size = get_mpi_info()

        # Set up mapping with CP enabled (cp_size = world_size, tp_size = 1)
        self.mapping = Mapping(
            world_size=self.world_size,
            rank=self.rank,
            tp_size=1,
            cp_size=self.world_size,
            pp_size=1,
        )
        self.dist = MPIDist(mapping=self.mapping)

    def test_broadcast_numpy_array(self):
        """Test broadcasting a numpy array via cp_broadcast."""
        root = 0
        shape = (64, 128)

        if self.mapping.cp_rank == root:
            # Root rank creates the data to broadcast
            data = np.random.randn(*shape).astype(np.float32)
        else:
            # Non-root ranks have empty/zero data
            data = np.zeros(shape, dtype=np.float32)

        # Store original data from root for verification
        from mpi4py import MPI
        expected = np.zeros(shape, dtype=np.float32)
        MPI.COMM_WORLD.Bcast(data if self.mapping.cp_rank == root else expected,
                             root=root)
        if self.mapping.cp_rank == root:
            expected = data.copy()

        # Perform cp_broadcast
        result = self.dist.cp_broadcast(data, root=root)

        # Verify all ranks have the same data
        np.testing.assert_array_almost_equal(result, expected)

    def test_broadcast_python_dict(self):
        """Test broadcasting a Python dictionary via cp_broadcast."""
        root = 0

        if self.mapping.cp_rank == root:
            obj = {
                "model_name": "llama",
                "batch_size": 32,
                "tokens": [1, 2, 3, 4, 5],
                "config": {"hidden_size": 4096, "num_layers": 32}
            }
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        # Verify all ranks received the correct object
        assert result["model_name"] == "llama"
        assert result["batch_size"] == 32
        assert result["tokens"] == [1, 2, 3, 4, 5]
        assert result["config"]["hidden_size"] == 4096
        assert result["config"]["num_layers"] == 32

    def test_broadcast_python_list(self):
        """Test broadcasting a Python list via cp_broadcast."""
        root = 0

        if self.mapping.cp_rank == root:
            obj = ["request1", "request2", {"id": 123, "data": [1, 2, 3]}]
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        assert result == ["request1", "request2", {"id": 123, "data": [1, 2, 3]}]

    def test_broadcast_from_non_zero_root(self):
        """Test broadcasting from a non-zero root rank."""
        if self.world_size < 2:
            pytest.skip("Need at least 2 ranks to test non-zero root")

        root = 1  # Broadcast from rank 1

        if self.mapping.cp_rank == root:
            obj = {"source": "rank1", "value": 42}
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        assert result["source"] == "rank1"
        assert result["value"] == 42

    def test_broadcast_large_object(self):
        """Test broadcasting a large object that may require chunking."""
        root = 0
        # Create a large list to test chunking behavior
        large_size = 100000

        if self.mapping.cp_rank == root:
            obj = list(range(large_size))
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        assert len(result) == large_size
        assert result[0] == 0
        assert result[-1] == large_size - 1

    def test_broadcast_string(self):
        """Test broadcasting a simple string via cp_broadcast."""
        root = 0

        if self.mapping.cp_rank == root:
            obj = "Hello from root rank!"
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        assert result == "Hello from root rank!"


# Additional integration-style test that can be run standalone
def test_mpi_cp_broadcast_integration():
    """
    Integration test for MPIDist cp_broadcast.

    Run with: mpirun -n 2 python -m pytest tests/unittest/_torch/distributed/test_cp_broadcast.py::test_mpi_cp_broadcast_integration -v
    """
    rank, world_size = get_mpi_info()
    if world_size < 2:
        pytest.skip("Test requires at least 2 MPI ranks")

    # Create mapping with CP enabled
    mapping = Mapping(
        world_size=world_size,
        rank=rank,
        tp_size=1,
        cp_size=world_size,
        pp_size=1,
    )
    dist = MPIDist(mapping=mapping)

    # Test 1: Broadcast dict
    if mapping.cp_rank == 0:
        payload = {"requests": [{"id": i} for i in range(10)]}
    else:
        payload = None

    result = dist.cp_broadcast(payload, root=0)
    assert len(result["requests"]) == 10
    assert result["requests"][0]["id"] == 0

    # Test 2: Broadcast numpy array
    shape = (32, 64)
    if mapping.cp_rank == 0:
        arr = np.ones(shape, dtype=np.float32) * (rank + 1)
    else:
        arr = np.zeros(shape, dtype=np.float32)

    result = dist.cp_broadcast(arr, root=0)
    expected_val = 1.0  # From rank 0
    np.testing.assert_array_almost_equal(result, np.ones(shape) * expected_val)


if __name__ == "__main__":
    # Allow running directly with mpirun
    pytest.main([__file__, "-v"])

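Reviewer note: every MPIDist test above follows the same pattern: build a Mapping whose cp_size equals the world size, wrap it in MPIDist, and let cp_broadcast replicate the CP root's payload to all CP ranks. A minimal standalone sketch of that pattern, using only the Mapping/MPIDist calls exercised in the tests (the file name is hypothetical; run with mpirun -n 2 python cp_broadcast_sketch.py):

    import numpy as np
    from mpi4py import MPI

    from tensorrt_llm._torch.distributed import MPIDist
    from tensorrt_llm.mapping import Mapping

    comm = MPI.COMM_WORLD
    rank, world_size = comm.Get_rank(), comm.Get_size()

    # CP spans the whole job: cp_size == world_size, tp_size == pp_size == 1.
    mapping = Mapping(world_size=world_size, rank=rank, tp_size=1,
                      cp_size=world_size, pp_size=1)
    dist = MPIDist(mapping=mapping)

    # Only the CP root supplies a payload; every CP rank gets the root's copy back.
    payload = {"tokens": [1, 2, 3]} if mapping.cp_rank == 0 else None
    payload = dist.cp_broadcast(payload, root=0)
    assert payload == {"tokens": [1, 2, 3]}

    # Numpy arrays go through the same call.
    arr = (np.ones((4, 4), np.float32) if mapping.cp_rank == 0
           else np.zeros((4, 4), np.float32))
    arr = dist.cp_broadcast(arr, root=0)
    assert arr.sum() == 16
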
tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py

Lines changed: 159 additions & 0 deletions
@@ -258,3 +258,162 @@ def test_allreduce_pg_op(setup_ray_cluster, seq_len, hidden_size):
    ])
    for r in results:
        assert r is True


@ray.remote(num_gpus=1)
class CpBroadcastTest:
    """Test worker for cp_broadcast operations with context parallelism."""

    def __init__(self, rank, world_size, tp_size, cp_size):
        self.rank = rank
        self.world_size = world_size
        self.tp_size = tp_size
        self.cp_size = cp_size
        self.master_address = os.environ["MASTER_ADDR"]

        assert len(ray.get_gpu_ids()) == 1
        self.gpu = int(ray.get_gpu_ids()[0])
        from tensorrt_llm.executor.ray_gpu_worker import RayWorkerWrapper
        local_gpu = RayWorkerWrapper.physical_to_local_id(self.gpu)
        torch.cuda.set_device(local_gpu)

    def _create_tcp_store(self,
                          port: Optional[int] = None
                          ) -> torch.distributed.TCPStore:
        actual_port = port if port is not None else 0
        return torch.distributed.TCPStore(host_name=self.master_address,
                                          port=actual_port,
                                          world_size=self.world_size,
                                          is_master=(self.rank == 0),
                                          wait_for_workers=False)

    def setup_tcp_store(self):
        if self.rank != 0:
            raise RuntimeError("Only the master worker can setup TCP store")
        self.store = self._create_tcp_store()
        return self.store.port

    def setup_distributed_env(self, port: int):
        if self.rank != 0:
            self.store = self._create_tcp_store(port)

        torch.distributed.init_process_group(backend="cuda:nccl,cpu:gloo",
                                             store=self.store,
                                             world_size=self.world_size,
                                             rank=self.rank)
        self.mapping = Mapping(world_size=self.world_size,
                               gpus_per_node=self.world_size,
                               tp_size=self.tp_size,
                               cp_size=self.cp_size,
                               rank=self.rank)
        self.dist = TorchDist(self.mapping)

    def run_tensor_broadcast(self, root_tensor: torch.Tensor, root: int = 0):
        """Test broadcasting a tensor via cp_broadcast."""
        cp_rank = self.mapping.cp_rank
        if cp_rank == root:
            # Root rank has the tensor to broadcast
            tensor = root_tensor.cuda()
        else:
            # Non-root ranks start with zeros
            tensor = torch.zeros_like(root_tensor).cuda()

        result = self.dist.cp_broadcast(tensor, root=root)

        # After broadcast, all CP ranks should have the same tensor
        expected = root_tensor.cuda()
        return torch.allclose(result, expected)

    def run_object_broadcast(self, root_obj, root: int = 0):
        """Test broadcasting a non-tensor object via cp_broadcast."""
        cp_rank = self.mapping.cp_rank
        if cp_rank == root:
            obj = root_obj
        else:
            obj = None

        result = self.dist.cp_broadcast(obj, root=root)

        # After broadcast, all CP ranks should have the same object
        return result == root_obj


@pytest.mark.gpu2
@pytest.mark.parametrize("hidden_size", [128, 512],
                         ids=lambda x: f"hidden:{x}")
@pytest.mark.parametrize("seq_len", [16, 32], ids=lambda x: f"seqlen:{x}")
def test_cp_broadcast_tensor(setup_ray_cluster, seq_len, hidden_size):
    """Test TorchDist.cp_broadcast with tensor data."""
    torch.manual_seed(42)
    dtype = torch.bfloat16
    world_size = 2
    tp_size = 1
    cp_size = 2  # Enable context parallelism

    # Create tensor to broadcast from root
    root_tensor = torch.randn((seq_len, hidden_size), dtype=dtype)

    runtime_env = ray.runtime_env.RuntimeEnv()
    runtime_env["env_vars"] = os.environ.copy()
    runtime_env["env_vars"].update({
        "TLLM_DISABLE_MPI": "1",
        "MASTER_ADDR": "127.0.0.1",
    })

    remote_tests = []
    for rank in range(world_size):
        remote_tests.append(
            CpBroadcastTest.options(runtime_env=runtime_env).remote(
                rank, world_size, tp_size, cp_size))

    ray.get([test.__ray_ready__.remote() for test in remote_tests])

    port = ray.get(remote_tests[0].setup_tcp_store.remote())
    ray.get([test.setup_distributed_env.remote(port) for test in remote_tests])

    # Test broadcasting from root=0
    results = ray.get([
        test.run_tensor_broadcast.remote(root_tensor, root=0)
        for test in remote_tests
    ])
    for r in results:
        assert r is True, "Tensor broadcast from root=0 failed"


@pytest.mark.gpu2
@pytest.mark.parametrize("test_object", [
    {"key1": "value1", "key2": [1, 2, 3]},
    ["item1", "item2", {"nested": True}],
    "simple_string",
], ids=["dict", "list", "string"])
def test_cp_broadcast_object(setup_ray_cluster, test_object):
    """Test TorchDist.cp_broadcast with non-tensor objects."""
    world_size = 2
    tp_size = 1
    cp_size = 2  # Enable context parallelism

    runtime_env = ray.runtime_env.RuntimeEnv()
    runtime_env["env_vars"] = os.environ.copy()
    runtime_env["env_vars"].update({
        "TLLM_DISABLE_MPI": "1",
        "MASTER_ADDR": "127.0.0.1",
    })

    remote_tests = []
    for rank in range(world_size):
        remote_tests.append(
            CpBroadcastTest.options(runtime_env=runtime_env).remote(
                rank, world_size, tp_size, cp_size))

    ray.get([test.__ray_ready__.remote() for test in remote_tests])

    port = ray.get(remote_tests[0].setup_tcp_store.remote())
    ray.get([test.setup_distributed_env.remote(port) for test in remote_tests])

    # Test broadcasting object from root=0
    results = ray.get([
        test.run_object_broadcast.remote(test_object, root=0)
        for test in remote_tests
    ])
    for r in results:
        assert r is True, f"Object broadcast from root=0 failed for {type(test_object)}"

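Reviewer note: the Ray/TorchDist path above does the equivalent without MPI: each worker joins a "cuda:nccl,cpu:gloo" process group through a shared TCPStore, builds a Mapping with cp_size == world_size, and then calls TorchDist.cp_broadcast on either a CUDA tensor or a plain Python object. A minimal per-rank sketch of that flow outside Ray, under two assumptions not confirmed by this diff: that TorchDist is importable from tensorrt_llm._torch.distributed alongside MPIDist, and that the script is launched with torchrun --nproc_per_node=2 so RANK/WORLD_SIZE and the rendezvous variables are set:

    import os

    import torch

    from tensorrt_llm._torch.distributed import TorchDist  # assumed import path
    from tensorrt_llm.mapping import Mapping

    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    torch.cuda.set_device(rank)

    # torchrun supplies MASTER_ADDR/MASTER_PORT, so the default env:// store suffices here.
    torch.distributed.init_process_group(backend="cuda:nccl,cpu:gloo",
                                         world_size=world_size, rank=rank)

    mapping = Mapping(world_size=world_size, gpus_per_node=world_size,
                      tp_size=1, cp_size=world_size, rank=rank)
    dist = TorchDist(mapping)

    # Tensor payload: non-root CP ranks start from zeros and receive the root's values.
    t = (torch.ones(8, device="cuda") if mapping.cp_rank == 0
         else torch.zeros(8, device="cuda"))
    t = dist.cp_broadcast(t, root=0)
    assert torch.allclose(t, torch.ones(8, device="cuda"))

    # Object payload: anything picklable is replicated the same way.
    meta = {"step": 1} if mapping.cp_rank == 0 else None
    meta = dist.cp_broadcast(meta, root=0)
    assert meta == {"step": 1}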