Adjust autotuning args: drop num_stages sweep and num_warps=8

yzhangcs · yzhangcs · commit b1d78f55be9f · 2025-03-15T11:36:47.000-07:00
diff --git a/3rdparty/flash-linear-attention b/3rdparty/flash-linear-attention
@@ -1 +1 @@
-Subproject commit d262316b96872e4b5b4112cbc639a46b48930292
+Subproject commit ffe659365bb8e2c1d75b8c8a27d32dec090a27f7
diff --git a/native_sparse_attention/ops/parallel.py b/native_sparse_attention/ops/parallel.py
@@ -7,7 +7,6 @@
 import torch
 import triton
 import triton.language as tl
-import triton.language.core as core
 from einops import rearrange
 
 from fla.ops.common.utils import (prepare_chunk_indices, prepare_chunk_offsets,
@@ -31,9 +30,8 @@
 })
 @triton.autotune(
     configs=[
-        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8]
-        for num_stages in [2, 3, 4, 5]
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
@@ -134,9 +132,8 @@ def parallel_nsa_compression_fwd_kernel(
 })
 @triton.autotune(
     configs=[
-        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8]
-        for num_stages in [2, 3, 4, 5]
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
@@ -238,7 +235,7 @@ def parallel_nsa_compression_bwd_kernel_dq(
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps)
-        for num_warps in [1, 2, 4, 8]
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
@@ -334,9 +331,8 @@ def parallel_nsa_compression_bwd_kernel_dkv(
 })
 @triton.autotune(
     configs=[
-        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8]
-        for num_stages in [2, 3, 4, 5]
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK'],
 )
@@ -443,8 +439,8 @@ def parallel_nsa_kernel_topk(
         b_i, b_ip = tl.sum(b_p, 0), b_i
         o_i, o_ip = tl.where(o_c <= i_t // BS, o_c + 1, 0), o_i
 
-        n_dims: core.constexpr = tl.standard._log2(b_i.shape[0])
-        for i in core.static_range(1, n_dims):
+        n_dims: tl.constexpr = tl.standard._log2(b_i.shape[0])
+        for i in tl.static_range(1, n_dims):
             b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), i, 2, n_dims)
 
         if i_c != 0:
@@ -469,7 +465,7 @@ def parallel_nsa_kernel_topk(
 @triton.autotune(
     configs=[
         triton.Config({}, num_warps=num_warps)
-        for num_warps in [1, 2, 4, 8]
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
@@ -613,9 +609,8 @@ def parallel_nsa_bwd_kernel_preprocess(
 })
 @triton.autotune(
     configs=[
-        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8]
-        for num_stages in [2, 3, 4, 5]
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
@@ -721,9 +716,8 @@ def parallel_nsa_bwd_kernel_dq(
 })
 @triton.autotune(
     configs=[
-        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
-        for num_warps in [1, 2, 4, 8]
-        for num_stages in [2, 3, 4, 5]
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4]
     ],
     key=['BS', 'BK', 'BV'],
 )
diff --git a/native_sparse_attention/ops/utils.py b/native_sparse_attention/ops/utils.py
@@ -55,11 +55,6 @@ def _bitonic_merge(
     order: core.constexpr,
     n_dims: core.constexpr,
 ):
-    '''
-    order_type 0 == ascending
-    order_type 1 == descending
-    order_type 2 == alternating
-    '''
     n_outer: core.constexpr = x.numel >> n_dims
     core.static_assert(stage <= n_dims)
     # flip denotes whether to re-arrange sub-sequences of elements in ascending or