 from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend
 
 import einx
-from einops import rearrange, einsum
+from einops import rearrange, einsum, repeat
 
 assert torch.cuda.is_available()
 
@@ -33,7 +33,8 @@ def regular_attend(
 
     # rest of the indices
 
-    has_sel_kv_blocks = indices.shape[-1] > 0
+    num_sel_kv_blocks = indices.shape[-1]
+    has_sel_kv_blocks = num_sel_kv_blocks > 0
 
     if has_sel_kv_blocks:
         bk, bv = tuple(rearrange(t, 'b (h w) n d -> b h w n d', h = kv_heads) for t in (k, v))
@@ -43,14 +44,33 @@ def regular_attend(
         q = rearrange(q, 'b (h w) n d -> b h (w n) d', h = q_heads)
         bsim = einsum(q, sel_bk, 'b h i d, b h i j d -> b h i j') * scale
 
+        bsim = rearrange(bsim, 'b h (w i) (sel j) -> b h w i sel j', sel = num_sel_kv_blocks, i = fine_block_size)
+
+        mask = rearrange(mask, 'b h (w i) sel -> b h w i sel', i = fine_block_size)
+        bsim = torch.where(mask[..., None], bsim, -torch.finfo(bsim.dtype).max)
+
+        sim = rearrange(sim, 'b (h w) i j -> b h w i 1 j', h = q_heads)
+
+        sim = torch.cat((sim, bsim), dim = -2)
+        sim = rearrange(sim, 'b h w i causal_and_sel j -> b h (w i) (causal_and_sel j)')
+
+        sel_bv = rearrange(sel_bv, 'b h (w i) j d -> b h w i j d', i = fine_block_size)
+
+        v = repeat(v, 'b (h w) j d -> b h w i j d', h = kv_heads, i = fine_block_size)
+        v = torch.cat((v, sel_bv), dim = -2)
+        v = rearrange(v, 'b h w i j d -> b h (w i) j d')
+
     # attend
 
     attn = sim.softmax(dim = -1)
 
-    out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
+    if has_sel_kv_blocks:
+        out = einsum(attn, v, 'b h i j, b h i j d -> b h i d')
+    else:
+        out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
 
-    if exists(block_size):
-        out = rearrange(out, 'b (h w) n d -> b h (w n) d', w = w)
+    if exists(block_size):
+        out = rearrange(out, 'b (h w) n d -> b h (w n) d', w = w)
 
     return out
 
@@ -62,8 +82,8 @@ def regular_attend(
 k = torch.randn(1, 2, 512, 64).cuda()
 v = torch.randn(1, 2, 512, 64).cuda()
 
-indices = torch.zeros(1, 2, 512, 1).long().cuda()
-mask = torch.zeros(1, 2, 512, 1).bool().cuda()
+indices = torch.zeros(1, 2, 512, 0).long().cuda()
+mask = torch.zeros(1, 2, 512, 0).bool().cuda()
 
 # both regular and nsa pathways `r` and `n`
 
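For orientation, below is a minimal sketch of how the regular and nsa pathways referenced in the comment above might be compared. The query tensor, its head count, the block size of 64, and both call signatures are assumptions not shown in these hunks; treat it as an illustration, not the script's actual test body.

import torch

# hypothetical inputs mirroring the k / v tensors above; q's head count is assumed
q = torch.randn(1, 4, 512, 64).cuda()
k = torch.randn(1, 2, 512, 64).cuda()
v = torch.randn(1, 2, 512, 64).cuda()

indices = torch.zeros(1, 2, 512, 0).long().cuda()
mask = torch.zeros(1, 2, 512, 0).bool().cuda()

# clone inputs for the regular pathway `r` and the nsa pathway `n`
rq, rk, rv = (t.clone().requires_grad_() for t in (q, k, v))
nq, nk, nv = (t.clone().requires_grad_() for t in (q, k, v))

ro = regular_attend(rq, rk, rv, indices, mask, block_size = 64)   # assumed signature
no = native_sparse_attend(nq, nk, nv, 64, indices, mask)          # assumed signature

# the two outputs should agree up to numerical tolerance
assert torch.allclose(ro, no, atol = 1e-2)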