from math import ceil
import torch
- from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend, round_up_multiple, pad_to_multiple
+
+ from native_sparse_attention_pytorch.native_sparse_attention import (
+     create_sliding_mask,
+     flex_attention
+ )
+
+ from native_sparse_attention_pytorch.triton_native_sparse_attention import (
+     native_sparse_attend,
+     round_up_multiple,
+     pad_to_multiple,
+ )

import einx
from einops import rearrange, einsum, repeat

def exists(v):
    return v is not None

+ def default(v, d):
+     return v if exists(v) else d
+
def abs_diff(x, y):
    return (x - y).abs().amax()

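The two names added to the first import block, create_sliding_mask and flex_attention, come from the package's flex-attention reference module. For orientation only, a causal sliding-window block mask along those lines might be built with torch's torch.nn.attention.flex_attention API as in the sketch below; the helper name and the inclusive window convention are assumptions, not the library's actual code.

# hedged sketch, not the library's implementation: a causal sliding-window
# block mask built with torch.nn.attention.flex_attention (PyTorch >= 2.5)

from torch.nn.attention.flex_attention import create_block_mask

def sliding_mask_sketch(seq_len, window_size, device = 'cuda'):
    def mask_mod(b, h, q_idx, kv_idx):
        causal = q_idx >= kv_idx                     # never attend to future positions
        in_window = (q_idx - kv_idx) <= window_size  # only the most recent tokens
        return causal & in_window

    # B = None / H = None broadcasts the mask over batch and heads
    return create_block_mask(mask_mod, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len, device = device)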
@@ -21,12 +34,22 @@ def regular_attend(
    indices,
    mask,
    block_size,
+     sliding_window_size = None,
    sel_scale = None,
-     return_lse = False
+     return_lse = False,
+     return_sliding_window_out = False
):
    q_heads, seq_len, kv_heads, device = q.shape[1], q.shape[-2], k.shape[1], q.device
    assert divisible_by(q_heads, kv_heads)

+     if return_sliding_window_out:
+         kv_seq_len = k.shape[-2]
+         assert seq_len == kv_seq_len
+
+         sliding_window_size = default(sliding_window_size, block_size)
+         sliding_mask = create_sliding_mask(kv_seq_len, sliding_window_size)
+         sliding_out = flex_attention(q, k, v, block_mask = sliding_mask, enable_gqa = True)
+
    q, k, v = tuple(pad_to_multiple(t, block_size, dim = -2) for t in (q, k, v))

    if exists(sel_scale):
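The branch added above runs the unpadded q/k/v through flex_attention with the sliding mask before the selection path continues. As a sanity check (not part of the commit), that sliding output could be compared against plain scaled_dot_product_attention with an explicit band mask; the helper below and its inclusive window convention are assumptions.

# hedged cross-check sketch: sliding-window attention via an explicit boolean band mask
import torch.nn.functional as F

def sliding_window_reference(q, k, v, window_size):
    n, device = q.shape[-2], q.device

    # expand kv heads so grouped-query attention reduces to plain multi-head attention
    g = q.shape[1] // k.shape[1]
    k, v = tuple(repeat(t, 'b h n d -> b (h g) n d', g = g) for t in (k, v))

    diff = torch.arange(n, device = device)[:, None] - torch.arange(n, device = device)[None, :]
    band_mask = (diff >= 0) & (diff <= window_size)  # causal and within the window

    return F.scaled_dot_product_attention(q, k, v, attn_mask = band_mask)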
@@ -97,6 +120,9 @@ def regular_attend(

    out = out[..., :seq_len, :]

+     if return_sliding_window_out:
+         out = (out, sliding_out)
+
    if not return_lse:
        return out

@@ -114,6 +140,7 @@ def regular_attend(
kv_heads = 2
fine_block_size = 16
num_sel = 6
+ fused_sliding_window = False

q = torch.randn(batch, q_heads, seq_len, 64).cuda()
k = torch.randn(batch, kv_heads, seq_len, 64).cuda()
@@ -130,7 +157,11 @@ def regular_attend(

# regular forwards and backwards

- out, rlse = regular_attend(rq, rk, rv, indices, mask, block_size = fine_block_size, sel_scale = rsel_scale, return_lse = True)
+ out, rlse = regular_attend(rq, rk, rv, indices, mask, block_size = fine_block_size, sel_scale = rsel_scale, return_lse = True, return_sliding_window_out = fused_sliding_window)
+
+ if fused_sliding_window:
+     out = sum(out)
+
out.sum().backward()

# triton nsa forwards and backwards
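To recap the reference call exercised above before the Triton comparison: with the new flag enabled, regular_attend returns a (fine_out, sliding_out) pair alongside the LSE, and the test sums the pair so backward runs through both branches. A usage sketch reusing the test's tensors (illustrative only):

# usage sketch, mirroring the test with the fused sliding window turned on
fine_and_sliding, rlse = regular_attend(
    rq, rk, rv, indices, mask,
    block_size = fine_block_size,
    sel_scale = rsel_scale,
    return_lse = True,
    return_sliding_window_out = True
)

out = sum(fine_and_sliding)  # fine selection branch + sliding window branch
out.sum().backward()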