Skip to content

Commit 167eea4

Browse files
authored
Optimize range gather for small hidden size and minor fix others (#202)
* implemented range_gather_per_range kernel to optimize cases with small hidden size, with ut updated * fixed range_gather_per_range_kernel bug for stride!=1 * fixed test_range_op bug for missing assert * fixed zero ROWS_PER_BLOCK; raised the meta args from int32 to int64 for range ops * added print_rank arg to assert_close with docstring updated; added rank info for test ffa test_case_string * moved random seed set to init_pg * updated the error thresholds for test_dist_attn * updated the base image version from 25.10.2 to 25.10.3 * updated the workflow to skip testing installation on cuda12 when magi_attention/csrc has no change * minor workflow fix
1 parent 4ea4003 commit 167eea4

File tree

14 files changed

+284
-130
lines changed

14 files changed

+284
-130
lines changed

.github/workflows/build_test.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,18 @@ jobs:
5050
filters: |
5151
MagiAttention:
5252
- 'magi_attention/**'
53+
MagiAttentionCsrc:
54+
- 'magi_attention/csrc/**'
5355
- name: print filter results
5456
run: |
5557
echo "is MagiAttention modified: ${{ steps.filter.outputs.MagiAttention }}"
58+
echo "is MagiAttention csrc modified: ${{ steps.filter.outputs.MagiAttentionCsrc }}"
5659
5760
install_MagiAttention_ngc2505_cuda12:
5861
needs: [detect_changes]
5962
if: |
6063
always() &&
61-
needs.detect_changes.outputs.MagiAttention == 'true'
64+
needs.detect_changes.outputs.MagiAttentionCsrc == 'true'
6265
runs-on: [self-hosted]
6366
container:
6467
image: registry.cn-sh-01.sensecore.cn/sandai-ccr/magi-base:25.05.4
@@ -101,11 +104,11 @@ jobs:
101104
always() &&
102105
(
103106
needs.detect_changes.outputs.MagiAttention == 'true' &&
104-
needs.install_MagiAttention_ngc2505_cuda12.result == 'success'
107+
(needs.detect_changes.outputs.MagiAttentionCsrc != 'true' || needs.install_MagiAttention_ngc2505_cuda12.result == 'success')
105108
)
106109
runs-on: [self-hosted]
107110
container:
108-
image: registry.cn-sh-01.sensecore.cn/sandai-ccr/magi-base:25.10.2
111+
image: registry.cn-sh-01.sensecore.cn/sandai-ccr/magi-base:25.10.3
109112
options: --gpus all --ipc host
110113
credentials:
111114
username: ${{ secrets.DOCKER_USER_NAME }}

magi_attention/comm/primitive/grpcoll/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def _calc_range_gather_kwargs_from_ranges_with_rank(
102102
total_size = sum(range_sizes)
103103

104104
# calculate row_map from row idx to range idx
105-
range_sizes = torch.tensor([0] + range_sizes, dtype=torch.int32, device=device)
105+
range_sizes = torch.tensor([0] + range_sizes, dtype=torch.int64, device=device)
106106
row_map = torch.repeat_interleave(
107107
torch.arange(0, len(ranges), device=device),
108108
range_sizes[1:],
@@ -141,7 +141,7 @@ def _calc_unperm_range_gather_kwargs_from_split_size_list(
141141
range_sizes = [end - start for start, end in ranges]
142142
range_sizes = torch.tensor(
143143
[0] + range_sizes,
144-
dtype=torch.int32,
144+
dtype=torch.int64,
145145
device=device,
146146
)
147147

@@ -186,7 +186,7 @@ def _calc_range_reduce_kwargs_from_ranges(
186186
total_size += reduce_end - reduce_start
187187

188188
range_reduce_kwargs: dict[str, Any] = {"deterministic": deterministic}
189-
input_ranges = torch.tensor(input_ranges, dtype=torch.int32, device=device)
189+
input_ranges = torch.tensor(input_ranges, dtype=torch.int64, device=device)
190190
range_reduce_kwargs["input_ranges"] = input_ranges
191191

192192
if deterministic:
@@ -214,7 +214,7 @@ def _calc_range_reduce_kwargs_from_ranges(
214214
range_reduce_kwargs["out2inp_range_map"] = out2inp_range_map
215215
range_reduce_kwargs["unique_ordered_out_ranges"] = unique_ordered_out_ranges
216216
else:
217-
range_sizes = torch.tensor([0] + range_sizes, dtype=torch.int32, device=device)
217+
range_sizes = torch.tensor([0] + range_sizes, dtype=torch.int64, device=device)
218218
cu_range_sizes = torch.cumsum(range_sizes, dim=0)
219219
row_map = torch.repeat_interleave(
220220
torch.arange(0, input_ranges.shape[0], device=device),
@@ -227,7 +227,7 @@ def _calc_range_reduce_kwargs_from_ranges(
227227
range_reduce_kwargs["total_size"] = total_size
228228
range_reduce_kwargs["row_map"] = row_map
229229

230-
output_ranges = torch.tensor(output_ranges, dtype=torch.int32, device=device)
230+
output_ranges = torch.tensor(output_ranges, dtype=torch.int64, device=device)
231231
range_reduce_kwargs["output_ranges"] = output_ranges
232232

233233
return range_reduce_kwargs

magi_attention/common/range_op/_range_gather.py

Lines changed: 134 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from typing import Literal, TypeAlias
16+
1517
import torch
1618
import triton
1719
import triton.language as tl
@@ -23,8 +25,61 @@
2325
__all__ = ["range_gather"]
2426

2527

28+
RangeGatherKernelBackend: TypeAlias = Literal["per_row", "per_range"]
29+
30+
31+
@triton.jit
32+
def range_gather_per_range_kernel(
33+
input_ptr,
34+
output_ptr,
35+
ranges_ptr,
36+
cu_range_sizes_ptr,
37+
input_stride,
38+
output_stride,
39+
N_PER_ROW: tl.constexpr,
40+
ROWS_PER_BLOCK: tl.constexpr,
41+
UNROLL_FACTOR: tl.constexpr = 4,
42+
):
43+
range_idx = tl.program_id(0)
44+
cu_range_size = tl.load(cu_range_sizes_ptr + range_idx)
45+
range_start = tl.load(ranges_ptr + range_idx * 2)
46+
range_end = tl.load(ranges_ptr + range_idx * 2 + 1)
47+
range_size = range_end - range_start
48+
49+
num_row_blocks = (range_size + ROWS_PER_BLOCK - 1) // ROWS_PER_BLOCK
50+
row_offs = tl.arange(0, ROWS_PER_BLOCK)[:, None]
51+
col_offs = tl.arange(0, N_PER_ROW)[None, :]
52+
input_offs = (row_offs * input_stride) + col_offs
53+
output_offs = (row_offs * output_stride) + col_offs
54+
col_mask = (col_offs < input_stride) & (col_offs < output_stride)
55+
56+
inp_idx = range_start * input_stride
57+
out_idx = cu_range_size * output_stride
58+
curr_inp_ptr = input_ptr + inp_idx
59+
curr_out_ptr = output_ptr + out_idx
60+
61+
for row_block_idx in tl.range(num_row_blocks, loop_unroll_factor=UNROLL_FACTOR):
62+
row_start = row_block_idx * ROWS_PER_BLOCK
63+
inp_ptr_this_block = curr_inp_ptr + row_start * input_stride
64+
out_ptr_this_block = curr_out_ptr + row_start * output_stride
65+
66+
row_mask = row_offs + row_start < range_size
67+
mask = row_mask & col_mask
68+
69+
inp = tl.load(
70+
inp_ptr_this_block + input_offs,
71+
mask=mask,
72+
)
73+
tl.store(
74+
out_ptr_this_block + output_offs,
75+
inp,
76+
mask=mask,
77+
cache_modifier=".cs", # cache streaming, since accessed once
78+
)
79+
80+
2681
@triton.jit
27-
def range_gather_kernel(
82+
def range_gather_per_row_kernel(
2883
input_ptr,
2984
output_ptr,
3085
ranges_ptr,
@@ -110,14 +165,28 @@ def range_gather(
110165
# sanity check
111166
assert cu_range_sizes.size(0) == ranges.size(0) + 1
112167

113-
# Calculate row_map if not provided
114-
row_map = kwargs.pop("row_map", None)
115-
if row_map is None:
116-
row_map = _calc_ranges_row_map(ranges, total_size)
117-
else:
118-
row_map = row_map.contiguous()
119-
# sanity check
120-
assert row_map.size(0) == total_size
168+
# Determine which kernel to use
169+
kernel_backend: RangeGatherKernelBackend | None = kwargs.pop("kernel_backend", None)
170+
if kernel_backend is None: # auto dispatch
171+
# heuristic: default use per-row kernel when hidden size per row is non-trivially small
172+
# TODO: refine the heuristic for better performance
173+
hidden_size_per_row = (
174+
input.numel() // input.shape[0] if input.shape[0] > 0 else 0
175+
)
176+
if hidden_size_per_row >= 128:
177+
kernel_backend = "per_row"
178+
else:
179+
kernel_backend = "per_range"
180+
181+
# Calculate row_map if not provided but required
182+
if kernel_backend == "per_row":
183+
row_map = kwargs.pop("row_map", None)
184+
if row_map is None:
185+
row_map = _calc_ranges_row_map(ranges, total_size)
186+
else:
187+
row_map = row_map.contiguous()
188+
# sanity check
189+
assert row_map.size(0) == total_size
121190

122191
# --- pre-process input/output --- #
123192

@@ -145,30 +214,62 @@ def range_gather(
145214
input_stride = input.stride(0)
146215
output_stride = output.stride(0)
147216

148-
# --- calculate grid size --- #
149-
150-
M = total_size
151-
N = input.numel() // input.shape[0]
152-
153-
ELEM_PER_BLOCK = 2048 // input.element_size()
154-
N_BLOCK = triton.cdiv(N, ELEM_PER_BLOCK)
155-
156-
grid = (M, N_BLOCK)
157-
158-
# --- launch kernel --- #
159-
160-
range_gather_kernel[grid](
161-
input,
162-
output,
163-
ranges,
164-
cu_range_sizes,
165-
row_map,
166-
input_stride,
167-
output_stride,
168-
N,
169-
N_BLOCK,
170-
ELEM_PER_BLOCK,
171-
)
217+
match kernel_backend:
218+
case "per_row":
219+
# --- calculate grid size --- #
220+
221+
M = total_size
222+
N = input.numel() // input.shape[0]
223+
224+
ELEM_PER_BLOCK = 2048 // input.element_size() # heuristic
225+
N_BLOCK = triton.cdiv(N, ELEM_PER_BLOCK)
226+
227+
grid = (M, N_BLOCK)
228+
229+
# --- launch kernel --- #
230+
231+
range_gather_per_row_kernel[grid](
232+
input,
233+
output,
234+
ranges,
235+
cu_range_sizes,
236+
row_map,
237+
input_stride,
238+
output_stride,
239+
N,
240+
N_BLOCK,
241+
ELEM_PER_BLOCK,
242+
num_warps=4, # block_size=128
243+
)
244+
case "per_range":
245+
# --- calculate grid size --- #
246+
247+
M = ranges.shape[0]
248+
grid = (M,) # type: ignore[assignment]
249+
250+
N_PER_ROW = triton.next_power_of_2(
251+
max(input_stride, output_stride)
252+
) # heuristic
253+
avg_range_size = (total_size + M - 1) // M
254+
ROWS_PER_BLOCK = max(
255+
1, min(triton.next_power_of_2(avg_range_size // 2), 4096)
256+
) # heuristic
257+
258+
# --- launch kernel --- #
259+
260+
range_gather_per_range_kernel[grid](
261+
input,
262+
output,
263+
ranges,
264+
cu_range_sizes,
265+
input_stride,
266+
output_stride,
267+
N_PER_ROW,
268+
ROWS_PER_BLOCK,
269+
num_warps=8, # block_size=256
270+
)
271+
case _:
272+
raise ValueError(f"Unsupported kernel_backend: {kernel_backend}")
172273

173274
# --- post-process output --- #
174275

magi_attention/common/range_op/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def _calc_cu_range_sizes(
3030
total_size += end - start
3131
cu_range_sizes.append(total_size)
3232

33-
cu_range_sizes = torch.tensor(cu_range_sizes, dtype=torch.int32, device=device)
33+
cu_range_sizes = torch.tensor(cu_range_sizes, dtype=torch.int64, device=device)
3434

3535
return cu_range_sizes, total_size
3636

@@ -40,7 +40,7 @@ def _calc_ranges_row_map(
4040
total_size: int,
4141
) -> torch.Tensor:
4242
if ranges.shape[0] == 0:
43-
return torch.empty(0, dtype=torch.int32, device=ranges.device)
43+
return torch.empty(0, dtype=torch.int64, device=ranges.device)
4444

4545
row_map = torch.arange(0, ranges.shape[0], device=ranges.device)
4646
range_sizes = ranges[:, 1] - ranges[:, 0]
@@ -82,10 +82,10 @@ def _calc_out2inp_range_map(
8282
out2inp_range_map.append(inp_range_list)
8383

8484
out2inp_range_map = torch.tensor(
85-
out2inp_range_map, dtype=torch.int32, device=device
85+
out2inp_range_map, dtype=torch.int64, device=device
8686
)
8787
unique_ordered_out_ranges = torch.tensor(
88-
unique_ordered_out_ranges, dtype=torch.int32, device=device
88+
unique_ordered_out_ranges, dtype=torch.int64, device=device
8989
)
9090

9191
return out2inp_range_map, unique_ordered_out_ranges, max_inp_indices_size

magi_attention/testing/dist_common.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def init_pg(self) -> None:
7979
]:
8080
raise RuntimeError(f"Backend {self.backend} not supported!")
8181

82+
# Initialize the process group
8283
dist.init_process_group(
8384
backend=self.backend,
8485
world_size=self.world_size,
@@ -87,10 +88,13 @@ def init_pg(self) -> None:
8788
timeout=datetime.timedelta(minutes=30),
8889
)
8990

90-
# set device for nccl pg for collectives
91+
# Set the device for this process
9192
if "nccl" in self.backend:
9293
torch.cuda.set_device(self.rank)
9394

95+
# Set random seed with rank offset
96+
self._set_random_seed()
97+
9498
def destroy_pg(self) -> None:
9599
# Wait for all ranks to reach here before starting shutdown.
96100
# FIXME dist.barrier deadlocks with multiple threads and NCCL: https://github.com/pytorch/pytorch/issues/95895
@@ -112,7 +116,6 @@ def setUp(self) -> None:
112116
TIMEOUT_OVERRIDE.update({self.id().split(".")[-1]: timeout})
113117

114118
self._spawn_processes()
115-
self._set_random_seed()
116119

117120

118121
TestFunc = Callable[..., Any]

magi_attention/testing/precision.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import re
1616

1717
import torch
18+
import torch.distributed as dist
1819
from packaging import version
1920

2021
from magi_attention.functional.utils import safe_subtract
@@ -95,17 +96,35 @@ def assert_close(
9596
rtol: float = 1e-5,
9697
mismatch_threshold: float = 0,
9798
test_case: str = "",
99+
print_rank: int = 0,
98100
) -> None:
101+
"""Assert that two tensors are close within given tolerances,
102+
with a mismatch threshold to allow some degree of mismatch.
103+
104+
Args:
105+
a (torch.Tensor): tensor a.
106+
b (torch.Tensor): tensor b.
107+
atol (float, optional): absolute tolerance. Defaults to ``1e-5``.
108+
rtol (float, optional): relative tolerance. Defaults to ``1e-5``.
109+
mismatch_threshold (float, optional): allowed mismatch threshold. Defaults to ``0``.
110+
test_case (str, optional): test case description. Defaults to "".
111+
print_rank (int, optional): rank to print from. Defaults to ``0``.
112+
And set to ``-1`` to print from all ranks.
113+
"""
99114
assert (
100115
0 <= mismatch_threshold <= 1
101116
), f"{mismatch_threshold=} must be between 0 and 1"
117+
118+
if dist.is_initialized():
119+
rank = dist.get_rank()
120+
is_this_print_rank = print_rank == -1 or rank == print_rank
121+
else:
122+
is_this_print_rank = True
123+
102124
try:
103125
torch.testing.assert_close(a, b, atol=atol, rtol=rtol)
104126
no_mismatch_info = f"[{test_case}]: has no mismatch"
105-
if torch.distributed.is_initialized():
106-
if torch.distributed.get_rank() == 0:
107-
print(no_mismatch_info)
108-
else:
127+
if is_this_print_rank:
109128
print(no_mismatch_info)
110129
except AssertionError as e:
111130
error_msg = str(e)
@@ -119,10 +138,7 @@ def assert_close(
119138
)
120139

121140
if mismatch_ratio <= mismatch_threshold:
122-
if torch.distributed.is_initialized():
123-
if torch.distributed.get_rank() == 0:
124-
print(mismatch_info)
125-
else:
141+
if is_this_print_rank:
126142
print(mismatch_info)
127143
return
128144
else:

0 commit comments

Comments
 (0)