
Commit ef53205 (parent a19c859)

Better naming, trying to do some cleanup.

6 files changed: +261 −84 lines


iris/ccl/all_gather.py (14 additions, 14 deletions)
@@ -24,8 +24,8 @@ def persistent_all_gather(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -51,8 +51,8 @@ def persistent_all_gather(
         stride_in_m, stride_in_n: Strides for input tensor
         stride_out_m, stride_out_n: Strides for output tensor
         heap_bases: Heap base pointers for all ranks
-        cur_rank: Current rank within the group (for comparisons)
-        cur_rank_global: Rank within the `iris` instance
+        group_rank: Rank within the ProcessGroup (0 to group_size-1), used for tile assignment and comparisons
+        iris_rank: Rank in the iris context, used for iris RMA operations (heap_bases indexing)
         world_size: Total number of ranks in the group
         BLOCK_SIZE_M, BLOCK_SIZE_N: Block sizes for tiling
         GROUP_SIZE_M: Group size for M dimension tiling
@@ -104,15 +104,15 @@ def persistent_all_gather(
         data = tl.load(input_ptr_source, mask=input_mask, other=0.0)
 
         # Send local shard data to all destination ranks
-        # Each rank's input goes to output[cur_rank * M : (cur_rank + 1) * M, :] on all ranks
+        # Each rank's input goes to output[group_rank * M : (group_rank + 1) * M, :] on all ranks
         for i in tl.static_range(world_size):
             target_rank = rank_start + i * rank_stride
 
-            # Compute global output row indices: offset by cur_rank * M
-            rm_output = rm_input + cur_rank * M
+            # Compute global output row indices: offset by group_rank * M
+            rm_output = rm_input + group_rank * M
 
             # Output mask: only write where input was valid
-            output_mask = (rm_output[:, None] < (cur_rank + 1) * M) & (rn[None, :] < N)
+            output_mask = (rm_output[:, None] < (group_rank + 1) * M) & (rn[None, :] < N)
 
             # Combine masks: must be valid in both input and output
             combined_mask = input_mask & output_mask
@@ -124,16 +124,16 @@ def persistent_all_gather(
             output_ptr_target = output_ptr + output_offset
             output_ptr_target = tl.multiple_of(output_ptr_target, (BLOCK_SIZE_M, BLOCK_SIZE_N))
 
-            if i == cur_rank:
-                # Local destination (i == rank_in_group): use direct store
+            if i == group_rank:
+                # Local destination (i == group_rank): use direct store
                 tl.store(output_ptr_target, data, mask=combined_mask, cache_modifier=".wt")
             else:
                 # Remote destination: use iris.store to send data to remote destination
-                # Use cur_rank_global for iris IPC operations
+                # Use iris_rank for iris RMA operations (heap_bases indexing)
                 iris.store(
                     output_ptr_target,
                     data,
-                    cur_rank_global,
+                    iris_rank,
                     target_rank,
                     heap_bases,
                     mask=combined_mask,
@@ -183,8 +183,8 @@ def all_gather(
     )
 
     # Extract group information
-    # rank_in_group: position within the group (0, 1, 2, ...) - used for comparisons
-    # rank_global: global rank across all processes - used for iris RMA operations
+    # rank_in_group: position within the ProcessGroup (0, 1, 2, ...) - passed as group_rank to kernel
+    # rank_global: global rank in iris context - passed as iris_rank to kernel for RMA operations
     rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, shmem)
 
     M, N = input_tensor.shape[:2]
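A minimal host-side sketch (plain NumPy, not part of this commit) of the addressing scheme the renamed parameters encode: group_rank picks the destination row block inside every output buffer, while the iris rank derived from rank_start/rank_stride picks whose heap is written. The group layout below (rank_start=4, rank_stride=1) is an assumption chosen only for illustration.

import numpy as np

M, N = 4, 3
group_size = 2                    # ranks in the ProcessGroup
rank_start, rank_stride = 4, 1    # assumed: the group maps to iris ranks 4 and 5

# One M x N input shard per group rank; one (group_size*M) x N output per iris rank.
inputs = {g: np.full((M, N), g, dtype=np.float32) for g in range(group_size)}
outputs = {rank_start + g * rank_stride: np.zeros((group_size * M, N), np.float32) for g in range(group_size)}

for group_rank in range(group_size):                  # the rank doing the sends
    for i in range(group_size):                       # same loop shape as the kernel
        target_rank = rank_start + i * rank_stride    # iris rank that owns the buffer
        # group_rank selects the row block; target_rank selects whose heap is written.
        outputs[target_rank][group_rank * M:(group_rank + 1) * M, :] = inputs[group_rank]

# Every rank ends up with the same gathered tensor.
assert all(np.array_equal(outputs[rank_start], buf) for buf in outputs.values())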

iris/ccl/all_reduce.py (36 additions, 36 deletions)
@@ -138,8 +138,8 @@ def persistent_all_reduce_atomic(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -162,8 +162,8 @@ def persistent_all_reduce_atomic(
         M: Number of rows
         N: Number of columns
         heap_bases: Heap base pointers for all ranks
-        cur_rank: Current rank within the group (for comparisons)
-        cur_rank_global: Global rank (for iris IPC operations)
+        group_rank: Rank within the ProcessGroup (0 to group_size-1), used for tile assignment and comparisons
+        iris_rank: Rank in the iris context, used for iris RMA operations (heap_bases indexing)
         world_size: Total number of ranks in the group
     """
     pid = tl.program_id(0)
@@ -210,21 +210,21 @@ def persistent_all_reduce_atomic(
         data = tl.load(input_ptr_local, mask=mask)
 
         # Atomically add to output buffer on all ranks
-        # Each rank's output tensor is in its own heap, accessible via IPC
+        # Each rank's output tensor is in its own heap, accessible via RMA
        for i in range(world_size):
            target_rank = rank_start + i * rank_stride
-            if i == cur_rank:
-                # For the current rank (i == rank_in_group), use local atomic add
+            if i == group_rank:
+                # For the current rank (i == group_rank), use local atomic add
                # output_ptr is already in current rank's address space
                tl.atomic_add(output_ptr + output_offset, data, mask=mask)
            else:
                # For remote ranks, use iris.atomic_add to translate pointer
-                # This accesses the remote rank's heap via IPC
-                # Use cur_rank_global for iris operations (heap_bases indexing)
+                # This accesses the remote rank's heap via RMA
+                # Use iris_rank for iris operations (heap_bases indexing)
                iris.atomic_add(
                    output_ptr + output_offset,
                    data,
-                    cur_rank_global,
+                    iris_rank,
                    target_rank,
                    heap_bases,
                    mask=mask,
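Same idea for the atomic variant, as a host-side NumPy sketch (not part of this commit): each group_rank adds its local tile into every rank's output buffer, taking the local path when i == group_rank and the rank_start/rank_stride mapping for the remote ones. The group layout below is an illustrative assumption.

import numpy as np

M, N = 4, 3
group_size = 2
rank_start, rank_stride = 0, 2    # assumed: the group maps to iris ranks 0 and 2

inputs = {g: np.random.rand(M, N).astype(np.float32) for g in range(group_size)}
outputs = {rank_start + g * rank_stride: np.zeros((M, N), np.float32) for g in range(group_size)}

for group_rank in range(group_size):                  # the rank doing the adds
    for i in range(group_size):
        target_rank = rank_start + i * rank_stride    # iris rank that owns the buffer
        # i == group_rank corresponds to the local tl.atomic_add branch;
        # the other iterations correspond to iris.atomic_add into the remote heap.
        outputs[target_rank] += inputs[group_rank]

expected = sum(inputs.values())
assert all(np.allclose(buf, expected) for buf in outputs.values())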
@@ -245,8 +245,8 @@ def persistent_all_reduce_spinlock(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -310,7 +310,7 @@ def persistent_all_reduce_spinlock(
            remote_rank = rank_start + i * rank_stride
            partial = iris.load(
                input_ptr + input_offset,
-                cur_rank_global,
+                iris_rank,
                remote_rank,
                heap_bases,
                mask=mask,
@@ -333,8 +333,8 @@ def persistent_all_reduce_one_shot(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -389,7 +389,7 @@ def persistent_all_reduce_one_shot(
            remote_rank = rank_start + i * rank_stride
            partial = iris.load(
                input_ptr + input_offset,
-                cur_rank_global,
+                iris_rank,
                remote_rank,
                heap_bases,
                mask=mask,
@@ -416,8 +416,8 @@ def persistent_all_reduce_ring(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -504,7 +504,7 @@ def persistent_all_reduce_ring(
                remote_flag_ptr,
                0,
                0,
-                cur_rank_global,
+                iris_rank,
                next_rank,
                heap_bases,
                sem="acquire",
@@ -517,7 +517,7 @@ def persistent_all_reduce_ring(
            iris.store(
                ring_buffer + tile_offset,
                send_data,
-                cur_rank_global,
+                iris_rank,
                next_rank,
                heap_bases,
                mask=mask,
@@ -526,7 +526,7 @@ def persistent_all_reduce_ring(
            iris.atomic_xchg(
                remote_flag_ptr,
                1,
-                cur_rank_global,
+                iris_rank,
                next_rank,
                heap_bases,
                sem="release",
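The three ring hunks above keep the same ordering around the rename: wait on the neighbour's flag with acquire semantics, store the payload into its ring buffer, then flip the flag with a release. A minimal sketch of that style of handshake using plain Python threads (illustrative only, not the kernel's actual synchronization primitives):

import threading

class RingSlot:
    """Single producer/consumer slot; flag plays the role of remote_flag_ptr."""

    def __init__(self):
        self.flag = 0          # 0 = slot free, 1 = slot full
        self.data = None       # stands in for ring_buffer + tile_offset
        self.cond = threading.Condition()

    def send(self, payload):
        with self.cond:
            while self.flag != 0:          # wait until the slot is free ("acquire")
                self.cond.wait()
            self.data = payload            # like iris.store(ring_buffer + tile_offset, send_data, ...)
            self.flag = 1                  # like iris.atomic_xchg(remote_flag_ptr, 1, ..., sem="release")
            self.cond.notify_all()

    def recv(self):
        with self.cond:
            while self.flag != 1:          # consumer side, not shown in the hunks above
                self.cond.wait()
            payload, self.data, self.flag = self.data, None, 0
            self.cond.notify_all()
            return payload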
@@ -560,8 +560,8 @@ def persistent_all_reduce_two_shot(
     stride_out_m,
     stride_out_n,
     heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    cur_rank_global: tl.constexpr,
+    group_rank: tl.constexpr,
+    iris_rank: tl.constexpr,
     world_size: tl.constexpr,
     rank_start: tl.constexpr,
     rank_stride: tl.constexpr,
@@ -586,13 +586,13 @@ def persistent_all_reduce_two_shot(
 
    tiles_per_rank = tl.cdiv(total_tiles, world_size)
    if DISTRIBUTION == 0:
-        start_tile = cur_rank
+        start_tile = group_rank
        stride = world_size
        remaining = total_tiles - start_tile
        remaining = tl.maximum(remaining, 0)
        max_tile_offset = tl.cdiv(remaining, stride)
    else:
-        start_tile = cur_rank * tiles_per_rank
+        start_tile = group_rank * tiles_per_rank
        stride = 1
        remaining = total_tiles - start_tile
        remaining = tl.maximum(remaining, 0)
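The DISTRIBUTION switch above assigns tiles to group_rank in one of two ways: round-robin (start at group_rank, step by world_size) or contiguous blocks of tiles_per_rank tiles. A small standalone sketch of the resulting ownership (the per-rank clamp in the contiguous branch is assumed, since the hunk is truncated at that point):

import math

def tiles_for_rank(group_rank, world_size, total_tiles, distribution):
    """Tile indices a given group_rank owns under the two distribution modes."""
    tiles_per_rank = math.ceil(total_tiles / world_size)
    if distribution == 0:
        # Round-robin: rank r takes tiles r, r + world_size, r + 2 * world_size, ...
        start_tile, stride, end = group_rank, world_size, total_tiles
    else:
        # Contiguous: rank r takes up to tiles_per_rank consecutive tiles.
        start_tile, stride = group_rank * tiles_per_rank, 1
        end = min(total_tiles, start_tile + tiles_per_rank)
    return list(range(start_tile, end, stride))

# Example: 10 tiles across 4 ranks.
for r in range(4):
    print(r, tiles_for_rank(r, 4, 10, 0), tiles_for_rank(r, 4, 10, 1))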
@@ -636,11 +636,11 @@ def persistent_all_reduce_two_shot(
 
        start_rank_idx = pid % world_size
        start_rank_global = rank_start + start_rank_idx * rank_stride
-        acc = iris.load(base_ptr, cur_rank_global, start_rank_global, heap_bases).to(acc_dtype)
+        acc = iris.load(base_ptr, iris_rank, start_rank_global, heap_bases).to(acc_dtype)
        for i in tl.static_range(1, world_size):
            remote_rank_idx = (start_rank_idx + i) % world_size
            remote_rank = rank_start + remote_rank_idx * rank_stride
-            acc += iris.load(base_ptr, cur_rank_global, remote_rank, heap_bases).to(acc_dtype)
+            acc += iris.load(base_ptr, iris_rank, remote_rank, heap_bases).to(acc_dtype)
 
        reduced = acc.to(output_ptr.type.element_ty)
 
@@ -649,8 +649,8 @@ def persistent_all_reduce_two_shot(
        for i in tl.static_range(0, world_size):
            remote_rank_idx = (start_rank_idx + i) % world_size
            remote_rank = rank_start + remote_rank_idx * rank_stride
-            if remote_rank_idx != cur_rank:
-                iris.store(out_ptr, reduced, cur_rank_global, remote_rank, heap_bases)
+            if remote_rank_idx != group_rank:
+                iris.store(out_ptr, reduced, iris_rank, remote_rank, heap_bases)
 
        # Slow path: MASKED (only boundary tiles land here)
        # This path handles tiles at tensor boundaries where not all elements are valid.
@@ -659,11 +659,11 @@ def persistent_all_reduce_two_shot(
 
        start_rank_idx = pid % world_size
        start_rank_global = rank_start + start_rank_idx * rank_stride
-        acc = iris.load(base_ptr, cur_rank_global, start_rank_global, heap_bases, mask=mask).to(acc_dtype)
+        acc = iris.load(base_ptr, iris_rank, start_rank_global, heap_bases, mask=mask).to(acc_dtype)
        for i in tl.static_range(1, world_size):
            remote_rank_idx = (start_rank_idx + i) % world_size
            remote_rank = rank_start + remote_rank_idx * rank_stride
-            acc += iris.load(base_ptr, cur_rank_global, remote_rank, heap_bases, mask=mask).to(acc_dtype)
+            acc += iris.load(base_ptr, iris_rank, remote_rank, heap_bases, mask=mask).to(acc_dtype)
 
        reduced = acc.to(output_ptr.type.element_ty)
 
@@ -672,8 +672,8 @@ def persistent_all_reduce_two_shot(
        for i in tl.static_range(0, world_size):
            remote_rank_idx = (start_rank_idx + i) % world_size
            remote_rank = rank_start + remote_rank_idx * rank_stride
-            if remote_rank_idx != cur_rank:
-                iris.store(out_ptr, reduced, cur_rank_global, remote_rank, heap_bases, mask=mask)
+            if remote_rank_idx != group_rank:
+                iris.store(out_ptr, reduced, iris_rank, remote_rank, heap_bases, mask=mask)
 
 
 def all_reduce(
@@ -729,8 +729,8 @@ def all_reduce(
     )
 
     # Extract group information
-    # rank_in_group: position within the group (0, 1, 2, ...) - used for tile assignment and comparisons
-    # rank_global: global rank across all processes - used for iris IPC operations
+    # rank_in_group: position within the ProcessGroup (0, 1, 2, ...) - passed as group_rank to kernel
+    # rank_global: global rank in iris context - passed as iris_rank to kernel for RMA operations
     rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, shmem)
     M, N = input_tensor.shape[:2]
 
@@ -843,7 +843,7 @@ def all_reduce(
     )
 
     # Calculate next rank in the ring for group support
-    # next_rank must be a global rank for iris IPC operations
+    # next_rank must be a global rank for iris RMA operations
     if group is None:
         # Simple case: next rank is just (rank_in_group + 1) % world_size (which equals global rank)
         next_rank = (rank_in_group + 1) % world_size
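The diff only shows the `group is None` branch, where the group rank and the global rank coincide. A hedged sketch of how the ring neighbour could be mapped to a global/iris rank when a group is present, reusing the rank_start + i * rank_stride convention from the kernels above (the group branch here is an assumption, not shown in this commit):

def ring_next_rank(rank_in_group, world_size, rank_start=0, rank_stride=1, group=None):
    """Global rank of the next ring neighbour; the group branch is illustrative only."""
    next_in_group = (rank_in_group + 1) % world_size
    if group is None:
        # Branch shown in the diff: group rank equals global rank.
        return next_in_group
    # Assumed mapping for the group case, following rank_start + i * rank_stride.
    return rank_start + next_in_group * rank_stride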
