@@ -141,33 +141,33 @@ def init_target():
     mpi_group.comm.barrier()


-def create_connection(group: mscclpp_comm.CommGroup, transport: str):
-    if transport == "NVLS":
+def create_connection(group: mscclpp_comm.CommGroup, connection_type: str):
+    if connection_type == "NVLS":
         all_ranks = list(range(group.nranks))
-        tran = Transport.Nvls
-        connection = group.make_connection(all_ranks, tran)
+        tran = Transport.CudaIpc
+        connection = group.make_connection(all_ranks, tran, use_switch=True)
         return connection

     remote_nghrs = list(range(group.nranks))
     remote_nghrs.remove(group.my_rank)
-    if transport == "NVLink":
+    if connection_type == "NVLink":
         tran = Transport.CudaIpc
-    elif transport == "IB":
+    elif connection_type == "IB":
         tran = group.my_ib_device(group.my_rank % 8)
     else:
         assert False
     connections = group.make_connection(remote_nghrs, tran)
     return connections


-def create_group_and_connection(mpi_group: MpiGroup, transport: str):
-    if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False:
+def create_group_and_connection(mpi_group: MpiGroup, connection_type: str):
+    if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False:
         pytest.skip("cannot use nvlink/nvls for cross node")
     group = mscclpp_comm.CommGroup(mpi_group.comm)
     try:
-        connection = create_connection(group, transport)
+        connection = create_connection(group, connection_type)
     except Error as e:
-        if transport == "IB" and e.args[0] == ErrorCode.InvalidUsage:
+        if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage:
             pytest.skip("IB not supported on this node")
         raise
     return group, connection
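For orientation, here is a minimal sketch (not part of this change) of how the renamed helper is driven; it mirrors the parametrized tests later in this diff, and the test name below is hypothetical:

# Hypothetical usage sketch, assuming this test module's imports and fixtures.
# create_group_and_connection() maps the string to a transport internally
# ("NVLink"/"NVLS" -> Transport.CudaIpc, "IB" -> the rank's IB device) and
# calls pytest.skip() when the requested connection type is unavailable.
@parametrize_mpi_groups(2)
@pytest.mark.parametrize("connection_type", ["NVLink", "IB", "NVLS"])
def test_connection_setup(mpi_group: MpiGroup, connection_type: str):
    group, connections = create_group_and_connection(mpi_group, connection_type)
    assert connections is not None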
@@ -194,10 +194,10 @@ def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):


 @parametrize_mpi_groups(2, 4, 8, 16)
-@pytest.mark.parametrize("transport", ["IB", "NVLink"])
+@pytest.mark.parametrize("connection_type", ["IB", "NVLink"])
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
-def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
-    group, connections = create_group_and_connection(mpi_group, transport)
+def test_connection_write(mpi_group: MpiGroup, connection_type: str, nelem: int):
+    group, connections = create_group_and_connection(mpi_group, connection_type)
     memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     sizePerRank = nelemPerRank * memory.itemsize
@@ -229,16 +229,16 @@ def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int)


 @parametrize_mpi_groups(2, 4, 8, 16)
-@pytest.mark.parametrize("transport", ["IB", "NVLink"])
+@pytest.mark.parametrize("connection_type", ["IB", "NVLink"])
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20, 27]])
 @pytest.mark.parametrize("device", ["cuda", "cpu"])
-def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, nelem: int, device: str):
+def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str, nelem: int, device: str):
     # this test starts with a random tensor on rank 0 and rotates it all the way through all ranks
     # and finally, comes back to rank 0 to make sure it matches all the original values

-    if device == "cpu" and transport == "NVLink":
+    if device == "cpu" and connection_type == "NVLink":
         pytest.skip("nvlink doesn't work with host allocated memory")
-    group, connections = create_group_and_connection(mpi_group, transport)
+    group, connections = create_group_and_connection(mpi_group, connection_type)
     xp = cp if device == "cuda" else np
     if group.my_rank == 0:
         memory = xp.random.randn(nelem)
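The rotation described in the comment inside test_connection_write_and_signal can be pictured with plain mpi4py calls; this sketch shows only the data flow, not the mscclpp connection/semaphore API used in the actual test, and assumes at least two ranks:

# Illustration only: rank 0 seeds random data, each rank forwards it to
# (rank + 1) % nranks, and rank 0 checks the copy that comes back.
import numpy as np
from mpi4py import MPI

def rotate_through_ring(comm: MPI.Comm, nelem: int) -> None:
    rank, nranks = comm.Get_rank(), comm.Get_size()
    if rank == 0:
        data = np.random.randn(nelem)
        comm.Send(data, dest=1)
        result = np.empty(nelem)
        comm.Recv(result, source=nranks - 1)
        assert np.allclose(result, data)
    else:
        data = np.empty(nelem)
        comm.Recv(data, source=rank - 1)
        comm.Send(data, dest=(rank + 1) % nranks)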
@@ -339,7 +339,7 @@ def test_nvls_connection(mpi_group: MpiGroup):
         pytest.skip("cannot use nvls for cross node")
     group = mscclpp_comm.CommGroup(mpi_group.comm)
     all_ranks = list(range(group.nranks))
-    nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
+    nvls_connection = group.make_connection(all_ranks, Transport.CudaIpc, use_switch=True)
     memory1 = GpuBuffer(2**29, cp.int8)
     memory2 = GpuBuffer(2**29, cp.int8)
     memory3 = GpuBuffer(2**29, cp.int8)
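Taken together with the first hunk, the transport change reduces to this: both peer-to-peer NVLink and switch-based NVLS connections now request Transport.CudaIpc, with the multicast path selected via use_switch (before/after forms taken from this diff):

# NVLink peer-to-peer connections (unchanged): CudaIpc, one per remote rank.
p2p_connections = group.make_connection(remote_nghrs, Transport.CudaIpc)
# NVLS switch connection: previously Transport.Nvls, now CudaIpc + use_switch.
nvls_connection = group.make_connection(all_ranks, Transport.CudaIpc, use_switch=True)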
@@ -449,13 +449,13 @@ def __call__(self):


 @parametrize_mpi_groups(2, 4, 8, 16)
-@pytest.mark.parametrize("transport", ["NVLink", "IB"])
-def test_h2d_semaphores(mpi_group: MpiGroup, transport: str):
+@pytest.mark.parametrize("connection_type", ["NVLink", "IB"])
+def test_h2d_semaphores(mpi_group: MpiGroup, connection_type: str):
     def signal(semaphores):
         for rank in semaphores:
             semaphores[rank].signal()

-    group, connections = create_group_and_connection(mpi_group, transport)
+    group, connections = create_group_and_connection(mpi_group, connection_type)

     semaphores = group.make_semaphore(connections, Host2DeviceSemaphore)
     kernel = MscclppKernel("h2d_semaphore", group.my_rank, group.nranks, semaphores)
@@ -530,9 +530,9 @@ def test_fifo(

 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
-@pytest.mark.parametrize("transport", ["IB", "NVLink"])
-def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
-    group, connections = create_group_and_connection(mpi_group, transport)
+@pytest.mark.parametrize("connection_type", ["IB", "NVLink"])
+def test_proxy(mpi_group: MpiGroup, nelem: int, connection_type: str):
+    group, connections = create_group_and_connection(mpi_group, connection_type)

     memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
@@ -579,10 +579,10 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):

 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
-@pytest.mark.parametrize("transport", ["NVLink", "IB"])
+@pytest.mark.parametrize("connection_type", ["NVLink", "IB"])
 @pytest.mark.parametrize("use_packet", [False, True])
-def test_port_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
-    group, connections = create_group_and_connection(mpi_group, transport)
+def test_port_channel(mpi_group: MpiGroup, nelem: int, connection_type: str, use_packet: bool):
+    group, connections = create_group_and_connection(mpi_group, connection_type)

     memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet: