Skip to content

Commit d146fa6

Browse files
committed
Adjust autotuning args
1 parent 28e1179 commit d146fa6

File tree

3 files changed: +48 −61 lines

native_sparse_attention/ops/parallel.py

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import torch
88
import triton
99
import triton.language as tl
10-
import triton.language.core as core
1110
from einops import rearrange
1211

1312
from fla.ops.common.utils import (prepare_chunk_indices, prepare_chunk_offsets,
@@ -31,9 +30,8 @@
3130
})
3231
@triton.autotune(
3332
configs=[
34-
triton.Config({}, num_warps=num_warps, num_stages=num_stages)
35-
for num_warps in [1, 2, 4, 8]
36-
for num_stages in [2, 3, 4, 5]
33+
triton.Config({}, num_warps=num_warps)
34+
for num_warps in [1, 2, 4]
3735
],
3836
key=['BS', 'BK', 'BV'],
3937
)
@@ -134,9 +132,8 @@ def parallel_nsa_compression_fwd_kernel(
134132
})
135133
@triton.autotune(
136134
configs=[
137-
triton.Config({}, num_warps=num_warps, num_stages=num_stages)
138-
for num_warps in [1, 2, 4, 8]
139-
for num_stages in [2, 3, 4, 5]
135+
triton.Config({}, num_warps=num_warps)
136+
for num_warps in [1, 2, 4]
140137
],
141138
key=['BS', 'BK', 'BV'],
142139
)
@@ -238,7 +235,7 @@ def parallel_nsa_compression_bwd_kernel_dq(
238235
@triton.autotune(
239236
configs=[
240237
triton.Config({}, num_warps=num_warps)
241-
for num_warps in [1, 2, 4, 8]
238+
for num_warps in [1, 2, 4]
242239
],
243240
key=['BS', 'BK', 'BV'],
244241
)
@@ -334,9 +331,8 @@ def parallel_nsa_compression_bwd_kernel_dkv(
334331
})
335332
@triton.autotune(
336333
configs=[
337-
triton.Config({}, num_warps=num_warps, num_stages=num_stages)
338-
for num_warps in [1, 2, 4, 8]
339-
for num_stages in [2, 3, 4, 5]
334+
triton.Config({}, num_warps=num_warps)
335+
for num_warps in [1, 2, 4]
340336
],
341337
key=['BS', 'BK'],
342338
)
@@ -443,8 +439,8 @@ def parallel_nsa_kernel_topk(
443439
b_i, b_ip = tl.sum(b_p, 0), b_i
444440
o_i, o_ip = tl.where(o_c <= i_t // BS, o_c + 1, 0), o_i
445441

446-
n_dims: core.constexpr = tl.standard._log2(b_i.shape[0])
447-
for i in core.static_range(1, n_dims):
442+
n_dims: tl.constexpr = tl.standard._log2(b_i.shape[0])
443+
for i in tl.static_range(1, n_dims):
448444
b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), i, 2, n_dims)
449445

450446
if i_c != 0:
@@ -469,7 +465,7 @@ def parallel_nsa_kernel_topk(
469465
@triton.autotune(
470466
configs=[
471467
triton.Config({}, num_warps=num_warps)
472-
for num_warps in [1, 2, 4, 8]
468+
for num_warps in [1, 2, 4]
473469
],
474470
key=['BS', 'BK', 'BV'],
475471
)
@@ -613,9 +609,8 @@ def parallel_nsa_bwd_kernel_preprocess(
613609
})
614610
@triton.autotune(
615611
configs=[
616-
triton.Config({}, num_warps=num_warps, num_stages=num_stages)
617-
for num_warps in [1, 2, 4, 8]
618-
for num_stages in [2, 3, 4, 5]
612+
triton.Config({}, num_warps=num_warps)
613+
for num_warps in [1, 2, 4]
619614
],
620615
key=['BS', 'BK', 'BV'],
621616
)
@@ -721,9 +716,8 @@ def parallel_nsa_bwd_kernel_dq(
721716
})
722717
@triton.autotune(
723718
configs=[
724-
triton.Config({}, num_warps=num_warps, num_stages=num_stages)
725-
for num_warps in [1, 2, 4, 8]
726-
for num_stages in [2, 3, 4, 5]
719+
triton.Config({}, num_warps=num_warps)
720+
for num_warps in [1, 2, 4]
727721
],
728722
key=['BS', 'BK', 'BV'],
729723
)

native_sparse_attention/ops/utils.py

Lines changed: 33 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,74 +6,67 @@
66

77
# Code adapted from https://github.com/triton-lang/triton/issues/3698#issuecomment-2067681396
88

9-
109
import triton
11-
import triton.language.core as core
12-
from triton.language.standard import _log2, sum, zeros_like
10+
import triton.language as tl
1311

1412

1513
@triton.jit
1614
def _compare_and_swap(
1715
x,
1816
ids,
1917
flip,
20-
i: core.constexpr,
21-
n_dims: core.constexpr,
18+
i: tl.constexpr,
19+
n_dims: tl.constexpr,
2220
):
23-
n_outer: core.constexpr = x.numel >> n_dims
24-
shape: core.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
25-
y = core.reshape(x, shape)
21+
n_outer: tl.constexpr = x.numel >> n_dims
22+
shape: tl.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
23+
y = tl.reshape(x, shape)
2624
# slice left/right with 'stride' 2**(n_dims - i - 1)
27-
mask = core.arange(0, 2)[None, :, None]
28-
left = core.broadcast_to(sum(y * (1 - mask), 1)[:, None, :], shape).to(y.dtype)
29-
right = core.broadcast_to(sum(y * mask, 1)[:, None, :], shape).to(y.dtype)
30-
left = core.reshape(left, x.shape)
31-
right = core.reshape(right, x.shape)
25+
mask = tl.arange(0, 2)[None, :, None]
26+
left = tl.broadcast_to(tl.sum(y * (1 - mask), 1)[:, None, :], shape).to(y.dtype)
27+
right = tl.broadcast_to(tl.sum(y * mask, 1)[:, None, :], shape).to(y.dtype)
28+
left = tl.reshape(left, x.shape)
29+
right = tl.reshape(right, x.shape)
3230
# idx
33-
y_idx = core.reshape(ids, shape)
34-
left_idx = core.broadcast_to(sum(y_idx * (1 - mask), 1)[:, None, :], shape)
35-
right_idx = core.broadcast_to(sum(y_idx * mask, 1)[:, None, :], shape)
36-
left_idx = core.reshape(left_idx, x.shape).to(y_idx.dtype)
37-
right_idx = core.reshape(right_idx, x.shape).to(y_idx.dtype)
31+
y_idx = tl.reshape(ids, shape)
32+
left_idx = tl.broadcast_to(tl.sum(y_idx * (1 - mask), 1)[:, None, :], shape)
33+
right_idx = tl.broadcast_to(tl.sum(y_idx * mask, 1)[:, None, :], shape)
34+
left_idx = tl.reshape(left_idx, x.shape).to(y_idx.dtype)
35+
right_idx = tl.reshape(right_idx, x.shape).to(y_idx.dtype)
3836
# actual compare-and-swap
39-
idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
37+
idtype = tl.core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
4038
ileft = left.to(idtype, bitcast=True)
4139
iright = right.to(idtype, bitcast=True)
4240
ix = x.to(idtype, bitcast=True)
4341

4442
cond = (left > right) != flip
45-
ret = ix ^ core.where(cond, ileft ^ iright, zeros_like(ix))
46-
new_ids = ids ^ core.where(cond, left_idx ^ right_idx, zeros_like(ids))
43+
ret = ix ^ tl.where(cond, ileft ^ iright, tl.zeros_like(ix))
44+
new_ids = ids ^ tl.where(cond, left_idx ^ right_idx, tl.zeros_like(ids))
4745
return ret.to(x.dtype, bitcast=True), new_ids
4846

4947

5048
@triton.jit
5149
def _bitonic_merge(
5250
x,
5351
ids,
54-
stage: core.constexpr,
55-
order: core.constexpr,
56-
n_dims: core.constexpr,
52+
stage: tl.constexpr,
53+
order: tl.constexpr,
54+
n_dims: tl.constexpr,
5755
):
58-
'''
59-
order_type 0 == ascending
60-
order_type 1 == descending
61-
order_type 2 == alternating
62-
'''
63-
n_outer: core.constexpr = x.numel >> n_dims
64-
core.static_assert(stage <= n_dims)
56+
n_outer: tl.constexpr = x.numel >> n_dims
57+
tl.static_assert(stage <= n_dims)
6558
# flip denotes whether to re-arrange sub-sequences of elements in ascending or
6659
# descending order.
6760
# if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
6861
# if flip = 00110011... then all the elements will be re-arranged alternatingly (with
6962
# a stride of 2) at this stage
7063
if order == 2:
71-
shape: core.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**stage]
72-
flip = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
64+
shape: tl.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**stage]
65+
flip = tl.reshape(tl.broadcast_to(tl.arange(0, 2)[None, :, None], shape), x.shape)
7366
else:
7467
flip = order
7568
# perform `stage` rounds of `compare-and-swap`
76-
for i in core.static_range(stage):
69+
for i in tl.static_range(stage):
7770
x, ids = _compare_and_swap(x, ids, flip, i + (n_dims - stage), n_dims)
7871
return x, ids
7972

@@ -82,15 +75,15 @@ def _bitonic_merge(
8275
def argsort(
8376
x,
8477
ids,
85-
dim: core.constexpr = None,
86-
descending: core.constexpr = core.CONSTEXPR_0,
78+
dim: tl.constexpr = None,
79+
descending: tl.constexpr = tl.core.CONSTEXPR_0,
8780
):
8881
# handle default dimension or check that it is the most minor dim
89-
_dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
90-
core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
82+
_dim: tl.constexpr = len(x.shape) - 1 if dim is None else dim
83+
tl.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
9184
# iteratively run bitonic merge-sort steps
92-
n_dims: core.constexpr = _log2(x.shape[_dim])
85+
n_dims: tl.constexpr = tl.log2(x.shape[_dim])  # NOTE(review): `tl.log2` is Triton's floating-point math op, not the constexpr integer log2 helper; the parallel.py hunk in this same commit uses `tl.standard._log2` — this line presumably should too. Verify before merging.
9386

94-
for i in core.static_range(1, n_dims + 1):
87+
for i in tl.static_range(1, n_dims + 1):
9588
x, ids = _bitonic_merge(x, ids, i, 2 if i < n_dims else descending, n_dims)
9689
return x, ids

0 commit comments

Comments (0)