Skip to content

Commit 75ec61f

Browse files
committed
address copilot feedback
1 parent 2080f13 commit 75ec61f

File tree

3 files changed

+54
-44
lines changed

3 files changed

+54
-44
lines changed

iris/_distributed_helpers.py

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -248,12 +248,11 @@ def extract_group_info(group, rank, num_ranks):
248248

249249
group_ranks = dist.get_process_group_ranks(group)
250250
world_size = len(group_ranks)
251-
rank_global = dist.get_rank()
251+
rank_global = rank
252252

253253
if rank_global not in group_ranks:
254254
raise RuntimeError(
255-
f"Current rank {rank_global} is not part of the specified process group. "
256-
f"Group contains ranks: {group_ranks}"
255+
f"Rank {rank_global} is not part of the specified process group. Group contains ranks: {group_ranks}"
257256
)
258257

259258
rank_in_group = group_ranks.index(rank_global)
@@ -315,16 +314,17 @@ def _device_barrier_kernel(
315314
MAX_SPINS: tl.constexpr = 1_000_000_000,
316315
):
317316
"""
318-
Stateless device-side barrier using atomic operations on the symmetric heap.
317+
Device-side barrier using atomic operations on the symmetric heap.
318+
CUDA graph capturable.
319+
320+
Stateless w.r.t. host-side epoch tracking: there is no CPU-side epoch
321+
counter. Each rank's flag on the heap serves as its own epoch counter,
322+
managed entirely by the GPU via atomic_add. A persistent per-group flags
323+
tensor is cached in ``_device_barrier_state``.
319324
320325
Launched with grid=(1,). A single CTA:
321326
1. Atomically increments its own flag (atomic_add, release)
322327
2. Serially polls each remote rank's flag for the same value (acquire)
323-
324-
No CPU-side epoch tracking. Each rank's flag IS the epoch, managed
325-
entirely on the GPU via atomic_add. This makes the barrier safe for
326-
CUDA graph capture: during recording the kernel is just recorded,
327-
during replay all ranks increment together.
328328
"""
329329
# Increment own flag and determine target
330330
own_flag_ptr = flags_ptr + iris_rank
@@ -355,15 +355,17 @@ def _device_barrier_kernel(
355355

356356
def distributed_device_barrier(flags, group, rank, num_ranks, heap_bases):
357357
"""
358-
Stateless device-side barrier using atomic operations on the symmetric heap.
358+
Device-side barrier using atomic operations on the symmetric heap.
359+
CUDA graph capturable.
359360
360361
Unlike ``distributed_barrier`` which uses host-side ``torch.distributed.barrier()``,
361362
this launches a single-CTA Triton kernel that synchronizes via
362363
device-side atomics, making it safe to use during CUDA graph capture.
363364
364-
No CPU-side epoch tracking is needed. Each rank's flag on the symmetric
365-
heap serves as its own epoch counter, managed entirely by the GPU via
366-
atomic_add.
365+
Stateless w.r.t. host-side epoch tracking: each rank's flag on the
366+
symmetric heap serves as its own epoch counter, managed entirely by
367+
the GPU via atomic_add. A persistent per-group flags tensor is cached
368+
in ``_device_barrier_state``.
367369
368370
Args:
369371
flags: int32 tensor on symmetric heap, one element per rank.

iris/iris.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -996,12 +996,14 @@ def barrier(self, stream=None, group=None):
996996

997997
def device_barrier(self, group=None):
998998
"""
999-
Stateless device-side barrier that is CUDA graph capturable.
999+
Device-side barrier that is CUDA graph capturable.
10001000
10011001
Unlike ``barrier()`` which uses host-side ``torch.distributed.barrier()``,
10021002
this uses device-side atomic operations on the symmetric heap to synchronize
1003-
ranks. No CPU-side epoch tracking -- each rank's flag on the heap serves
1004-
as its own epoch counter, managed entirely by the GPU via atomic_add.
1003+
ranks. Stateless w.r.t. host-side epoch tracking: each rank's flag on
1004+
the heap serves as its own epoch counter, managed entirely by the GPU
1005+
via atomic_add. A persistent per-group flags tensor is cached in
1006+
``_device_barrier_state``.
10051007
10061008
Args:
10071009
group (ProcessGroup, optional): The process group to synchronize.

tests/unittests/test_barriers.py

Lines changed: 34 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -55,22 +55,23 @@ def _write_remote_kernel(
5555
@pytest.mark.parametrize("barrier_type", BARRIER_TYPES)
5656
def test_barrier_basic(barrier_type, n):
5757
shmem = iris.iris(1 << 20)
58-
shmem.barrier()
58+
_call_barrier(shmem, barrier_type)
5959

6060
try:
6161
for _ in range(n):
6262
_call_barrier(shmem, barrier_type)
6363
finally:
64-
shmem.barrier()
64+
_call_barrier(shmem, barrier_type)
6565
del shmem
6666
gc.collect()
6767

6868

6969
@pytest.mark.parametrize("n", [1, 2, 5, 10])
70-
def test_barrier_state_reuse(n):
70+
@pytest.mark.parametrize("barrier_type", BARRIER_TYPES)
71+
def test_barrier_state_reuse(barrier_type, n):
7172
"""Verify device barrier reuses the same flags tensor across calls."""
7273
shmem = iris.iris(1 << 20)
73-
shmem.barrier()
74+
_call_barrier(shmem, barrier_type)
7475

7576
try:
7677
shmem.device_barrier()
@@ -82,7 +83,7 @@ def test_barrier_state_reuse(n):
8283
shmem.device_barrier()
8384
assert shmem._device_barrier_state[None].data_ptr() == flags_ptr
8485
finally:
85-
shmem.barrier()
86+
_call_barrier(shmem, barrier_type)
8687
del shmem
8788
gc.collect()
8889

@@ -161,13 +162,13 @@ def _cross_rank_graph(
161162
buf,
162163
result,
163164
):
164-
stream = torch.cuda.Stream()
165+
capture_stream = torch.cuda.Stream()
165166

166167
if op == "load":
167168
buf.fill_(float(rank))
168169

169170
# Warmup on capture stream.
170-
with torch.cuda.stream(stream):
171+
with torch.cuda.stream(capture_stream):
171172
for _ in range(num_barriers):
172173
shmem.device_barrier()
173174
_read_remote_kernel[(1,)](
@@ -180,11 +181,11 @@ def _cross_rank_graph(
180181
)
181182
for _ in range(num_barriers):
182183
shmem.device_barrier()
183-
stream.synchronize()
184+
capture_stream.synchronize()
184185

185186
# Capture.
186187
graph = torch.cuda.CUDAGraph()
187-
with torch.cuda.graph(graph, stream=stream):
188+
with torch.cuda.graph(graph, stream=capture_stream):
188189
for _ in range(num_barriers):
189190
shmem.device_barrier()
190191
_read_remote_kernel[(1,)](
@@ -201,11 +202,11 @@ def _cross_rank_graph(
201202
# Replay with fresh data.
202203
for i in range(rounds):
203204
val = float(rank + (i + 1) * 10)
204-
buf.fill_(val)
205-
shmem.device_barrier()
206-
207-
graph.replay()
208-
stream.synchronize()
205+
with torch.cuda.stream(capture_stream):
206+
buf.fill_(val)
207+
shmem.device_barrier()
208+
graph.replay()
209+
capture_stream.synchronize()
209210

210211
expected = torch.full(
211212
(N,),
@@ -218,7 +219,7 @@ def _cross_rank_graph(
218219
buf.fill_(0.0)
219220

220221
# Warmup on capture stream.
221-
with torch.cuda.stream(stream):
222+
with torch.cuda.stream(capture_stream):
222223
for _ in range(num_barriers):
223224
shmem.device_barrier()
224225
_write_remote_kernel[(1,)](
@@ -231,11 +232,11 @@ def _cross_rank_graph(
231232
)
232233
for _ in range(num_barriers):
233234
shmem.device_barrier()
234-
stream.synchronize()
235+
capture_stream.synchronize()
235236

236237
# Capture.
237238
graph = torch.cuda.CUDAGraph()
238-
with torch.cuda.graph(graph, stream=stream):
239+
with torch.cuda.graph(graph, stream=capture_stream):
239240
for _ in range(num_barriers):
240241
shmem.device_barrier()
241242
_write_remote_kernel[(1,)](
@@ -251,13 +252,15 @@ def _cross_rank_graph(
251252

252253
# Replay and verify.
253254
for _ in range(rounds):
254-
buf.fill_(0.0)
255-
shmem.device_barrier()
256-
257-
graph.replay()
258-
stream.synchronize()
255+
with torch.cuda.stream(capture_stream):
256+
buf.fill_(0.0)
257+
shmem.device_barrier()
258+
graph.replay()
259+
capture_stream.synchronize()
259260

260-
shmem.device_barrier()
261+
with torch.cuda.stream(capture_stream):
262+
shmem.device_barrier()
263+
capture_stream.synchronize()
261264
expected = torch.full((N,), float(writer), dtype=torch.float32, device="cuda")
262265
torch.testing.assert_close(buf, expected, rtol=0, atol=0)
263266

@@ -288,7 +291,7 @@ def test_barrier_cross_rank(barrier_type, op, mode, num_barriers, N, rounds=3):
288291
)
289292

290293
shmem = iris.iris(1 << 20)
291-
shmem.barrier()
294+
_call_barrier(shmem, barrier_type)
292295
rank = shmem.get_rank()
293296
num_ranks = shmem.get_num_ranks()
294297
heap_bases = shmem.get_heap_bases()
@@ -332,12 +335,13 @@ def test_barrier_cross_rank(barrier_type, op, mode, num_barriers, N, rounds=3):
332335
result,
333336
)
334337
finally:
335-
shmem.barrier()
338+
_call_barrier(shmem, barrier_type)
336339
del shmem
337340
gc.collect()
338341

339342

340-
def test_barrier_timeout_assert():
343+
@pytest.mark.parametrize("barrier_type", BARRIER_TYPES)
344+
def test_barrier_timeout_assert(barrier_type):
341345
"""Verify device_barrier asserts on timeout instead of hanging forever.
342346
343347
Only rank 0 calls the barrier kernel. Other ranks skip it, so rank 0
@@ -351,7 +355,7 @@ def test_barrier_timeout_assert():
351355
if num_ranks < 2:
352356
pytest.skip("Need at least 2 ranks")
353357

354-
shmem.barrier()
358+
_call_barrier(shmem, barrier_type)
355359

356360
flags = shmem._device_barrier_state.setdefault(None, shmem.zeros((num_ranks,), dtype=torch.int32))
357361

@@ -370,6 +374,8 @@ def test_barrier_timeout_assert():
370374
with pytest.raises(RuntimeError, match="device-side assert"):
371375
torch.cuda.synchronize()
372376
finally:
373-
shmem.barrier()
377+
# No barrier here: rank 0's GPU is dead after the intentional
378+
# device-side assert. Any GPU sync (NCCL or device_barrier)
379+
# will hang or crash.
374380
del shmem
375381
gc.collect()

0 commit comments

Comments (0)