
Commit d471238

ruff formatting
1 parent d258021

File tree

4 files changed: +52 -20 lines
  src/pplx_kernels/nvshmem.py
  tests/bench_all_to_all.py
  tests/test_all_to_all.py
  tests/test_nvshmem.py

src/pplx_kernels/nvshmem.py

Lines changed: 14 additions & 4 deletions
@@ -7,7 +7,13 @@
 
 
 ###### NVSHMEM ######
-def nvshmem_init(global_rank: int, local_rank: int, world_size: int, device: Any, uid: Optional[Any] = None) -> None:
+def nvshmem_init(
+    global_rank: int,
+    local_rank: int,
+    world_size: int,
+    device: Any,
+    uid: Optional[Any] = None,
+) -> None:
     uniqueid = nvshmem.get_unique_id(empty=True)
     if local_rank == 0:
         uniqueid = nvshmem.get_unique_id()
@@ -18,7 +24,13 @@ def nvshmem_init(global_rank: int, local_rank: int, world_size: int, device: Any
     dist.broadcast_object_list(broadcast_objects, src=0)
     dist.barrier()
 
-    nvshmem.init(device=device, uid=broadcast_objects[0], rank=global_rank, nranks=world_size, initializer_method="uid")
+    nvshmem.init(
+        device=device,
+        uid=broadcast_objects[0],
+        rank=global_rank,
+        nranks=world_size,
+        initializer_method="uid",
+    )
 
 
 # This stream wrapper returns the format required by CUDA Python. This workaround will be removed when nvshmem4py supports Torch stream interoperability.
@@ -31,5 +43,3 @@ def __init__(self, pt_stream: Any) -> None:
     def __cuda_stream__(self) -> tuple[int, int]:
         stream_id = self.pt_stream.cuda_stream
         return (0, stream_id)
-
-

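For orientation, a minimal usage sketch of the helpers reformatted above, assuming a torch.distributed process group is already initialized. The wrapper class name is not visible in the hunk, so TorchStreamWrapper below is a placeholder, and the nvshmem4py import paths are assumptions; treat this as a sketch, not this module's actual API.

# Sketch only: wrapper class name and nvshmem4py import paths are assumptions.
import torch
import nvshmem.core as nvshmem          # import path assumed
from nvshmem.core import Teams          # import path assumed

from pplx_kernels.nvshmem import nvshmem_init, TorchStreamWrapper  # wrapper name assumed


def example_worker(global_rank: int, local_rank: int, world_size: int, device) -> None:
    # UID bootstrap (torch.distributed must already be initialized):
    # rank 0 creates the unique id, broadcasts it, and every rank calls nvshmem.init.
    nvshmem_init(
        global_rank=global_rank,
        local_rank=local_rank,
        world_size=world_size,
        device=device,
    )

    # Symmetric-memory tensors with one int32 slot per peer.
    t_in = nvshmem.tensor((world_size,), dtype=torch.int32).fill_(global_rank)
    t_out = nvshmem.tensor((world_size,), dtype=torch.int32)

    # Wrap the current torch stream so nvshmem4py can read it via __cuda_stream__.
    wrapped = TorchStreamWrapper(torch.cuda.current_stream())
    nvshmem.collective.alltoall(Teams.TEAM_WORLD, t_out, t_in, stream=wrapped)

    nvshmem.finalize()
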
tests/bench_all_to_all.py

Lines changed: 9 additions & 4 deletions
@@ -119,8 +119,8 @@ def bench_all_to_all(
     )
     a2a_out_tensor = torch.empty_like(a2a_tensor)
 
-    nvshmem_in = nvshmem.tensor( a2a_shape, dtype=torch.uint8 )
-    nvshmem_out = nvshmem.tensor( a2a_shape, dtype=torch.uint8 )
+    nvshmem_in = nvshmem.tensor(a2a_shape, dtype=torch.uint8)
+    nvshmem_out = nvshmem.tensor(a2a_shape, dtype=torch.uint8)
 
     # Compute stats
     dispatch_bytes = (
@@ -176,7 +176,9 @@ def run() -> tuple[float, ...]:
 
         e3.record(torch_stream_)
 
-        nvshmem.collective.alltoall(team, nvshmem_out, nvshmem_in, stream=torch_stream_wrapped)
+        nvshmem.collective.alltoall(
+            team, nvshmem_out, nvshmem_in, stream=torch_stream_wrapped
+        )
 
         e4.record(torch_stream_)
 
@@ -233,6 +235,7 @@ def run() -> tuple[float, ...]:
         result,
     )
 
+
 def _worker_bench_all_to_all(
     pgi: ProcessGroupInfo,
     dp_size: int,
@@ -246,7 +249,9 @@ def _worker_bench_all_to_all(
     dev = Device(local_rank)
     dev.set_current()
 
-    nvshmem_init(global_rank=global_rank, local_rank=local_rank, world_size=num_ranks, device=dev)
+    nvshmem_init(
+        global_rank=global_rank, local_rank=local_rank, world_size=num_ranks, device=dev
+    )
 
     in_dtype = getattr(torch, in_dtype_str)
     out_dtype = getattr(torch, out_dtype_str)

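A small sketch of the event-timing pattern that brackets the reformatted alltoall above. It assumes nvshmem, team, nvshmem_in, nvshmem_out, and torch_stream_wrapped are set up as in the diff; only the torch.cuda.Event usage is spelled out.

# Sketch: time a single NVSHMEM alltoall with CUDA events (names taken from the diff above).
import torch

torch_stream_ = torch.cuda.current_stream()
e3 = torch.cuda.Event(enable_timing=True)
e4 = torch.cuda.Event(enable_timing=True)

e3.record(torch_stream_)
nvshmem.collective.alltoall(
    team, nvshmem_out, nvshmem_in, stream=torch_stream_wrapped
)
e4.record(torch_stream_)

torch_stream_.synchronize()
alltoall_ms = e3.elapsed_time(e4)  # milliseconds between the two recorded events
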
tests/test_all_to_all.py

Lines changed: 8 additions & 4 deletions
@@ -283,6 +283,7 @@ def _do_test_all_to_all(
         ref_y[i_token] += rank_data.x[i_token].to(device).to(y.dtype) * val * weight
     torch.testing.assert_close(y[: rank_data.num_tokens], ref_y)
 
+
 def _worker_test_all_to_all(
     pgi: ProcessGroupInfo,
     dp_size: int,
@@ -300,9 +301,9 @@ def _worker_test_all_to_all(
     dev = Device(local_rank)
     dev.set_current()
 
-
-
-    nvshmem_init(global_rank=global_rank, local_rank=local_rank, world_size=num_ranks, device=dev)
+    nvshmem_init(
+        global_rank=global_rank, local_rank=local_rank, world_size=num_ranks, device=dev
+    )
 
     moe_config = dataclasses.replace(
         moe_config,
@@ -314,13 +315,16 @@ def _worker_test_all_to_all(
     if test_script_init_status < 2 and local_rank == 0:
         logger.warning(
             "NVSHMEM hostlib initialization incomplete - status: %d (rank: %d, local_rank: %d)",
-            test_script_init_status, global_rank, local_rank
+            test_script_init_status,
+            global_rank,
+            local_rank,
         )
 
     _do_test_all_to_all(pgi, dp_size, moe_config, internode, use_compile)
 
     nvshmem.finalize()
 
+
 @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Requires at least 4 GPUs")
 @pytest.mark.parametrize("in_dtype", ["bfloat16", "float8_e4m3fn", "float16"])
 @pytest.mark.parametrize("out_dtype", ["float16", "bfloat16"])

tests/test_nvshmem.py

Lines changed: 21 additions & 8 deletions
@@ -18,8 +18,8 @@
 
 logger = logging.getLogger(__name__)
 
-def test_nvshmem_1_gpu() -> None:
 
+def test_nvshmem_1_gpu() -> None:
     local_rank = 0
     rank_id = 0  # Define rank_id for single GPU test
 
@@ -35,7 +35,9 @@ def test_nvshmem_1_gpu() -> None:
     if test_script_init_status < 2 and local_rank == 0:
         logger.warning(
             "NVSHMEM hostlib initialization incomplete - status: %d (rank: %d, local_rank: %d)",
-            test_script_init_status, rank_id, local_rank
+            test_script_init_status,
+            rank_id,
+            local_rank,
         )
 
     assert nvshmem.my_pe() == 0
@@ -50,14 +52,21 @@ def _worker_test_nvshmem_4_gpu(pgi: ProcessGroupInfo) -> None:
     dev = Device(local_rank)
     dev.set_current()
 
-    nvshmem_init(global_rank=pgi.rank, local_rank=local_rank, world_size=pgi.world_size, device=dev)
+    nvshmem_init(
+        global_rank=pgi.rank,
+        local_rank=local_rank,
+        world_size=pgi.world_size,
+        device=dev,
+    )
 
     # Check host initialization status
     test_script_init_status = nvshmem.direct.init_status()
     if test_script_init_status < 2 and local_rank == 0:
         logger.warning(
             "NVSHMEM hostlib initialization incomplete - status: %d (rank: %d, local_rank: %d)",
-            test_script_init_status, pgi.rank, local_rank
+            test_script_init_status,
+            pgi.rank,
+            local_rank,
         )
 
     assert nvshmem.my_pe() == pgi.rank
@@ -80,21 +89,25 @@ def _worker_test_all_to_all(pgi: ProcessGroupInfo) -> None:
     num_ranks = dist.get_world_size()
     rank_id = dist.get_rank()
 
-    nvshmem_init(global_rank=rank_id, local_rank=local_rank, world_size=num_ranks, device=dev)
+    nvshmem_init(
+        global_rank=rank_id, local_rank=local_rank, world_size=num_ranks, device=dev
+    )
 
     # Check NVSHMEM host initialization status
     test_script_init_status = nvshmem.direct.init_status()
     if test_script_init_status < 2 and local_rank == 0:
         logger.warning(
             "NVSHMEM hostlib initialization incomplete - status: %d (rank: %d, local_rank: %d)",
-            test_script_init_status, rank_id, local_rank
+            test_script_init_status,
+            rank_id,
+            local_rank,
         )
 
     # all-to-all test
     try:
         # Allocate a PyTorch tensor backed by NVSHMEM symmetric memory
-        t_in = nvshmem.tensor( (pgi.world_size,), dtype=torch.int32 ).fill_(pgi.rank)
-        t_out = nvshmem.tensor( (pgi.world_size,), dtype=torch.int32 )
+        t_in = nvshmem.tensor((pgi.world_size,), dtype=torch.int32).fill_(pgi.rank)
+        t_out = nvshmem.tensor((pgi.world_size,), dtype=torch.int32)
 
         team = Teams.TEAM_WORLD
         nvshmem.collective.alltoall(team, t_out, t_in)

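One way to read the 4-GPU alltoall test above: each rank fills t_in with its own rank id, so after the world-team alltoall, slot i of t_out should contain i on every rank. A short check along those lines, as a sketch that reuses pgi and t_out from the test rather than code from this commit:

# Sketch: expected contents of t_out after the alltoall in _worker_test_all_to_all.
expected = torch.arange(pgi.world_size, dtype=torch.int32, device=t_out.device)
torch.cuda.synchronize()
torch.testing.assert_close(t_out, expected)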