 def exists(v):
     return v is not None

+def divisible_by(num, den):
+    return (num % den) == 0
+
 def regular_attend(
     q, k, v,
     indices,
     mask,
-    block_size = None,
+    block_size,
 ):
-    q_heads, kv_heads = q.shape[1], k.shape[1]
+    q_heads, seq_len, kv_heads, device = q.shape[1], q.shape[-2], k.shape[1], q.device
+    assert divisible_by(q_heads, kv_heads)
+
+    g = q_heads // kv_heads # `g` stands for `g`roups of query heads per kv head
+
+    assert divisible_by(seq_len, block_size)
+    w = seq_len // block_size

-    if exists(block_size):
-        w = q.shape[-2] // block_size
-        q, k, v = tuple(rearrange(t, 'b h (w n) d -> b (h w) n d', n = block_size) for t in (q, k, v))
+    q, k, v = tuple(rearrange(t, 'b h (w n) d -> b h w n d', n = block_size) for t in (q, k, v))

-    seq_len, device = q.shape[-2], q.device
     scale = q.shape[-1] ** -0.5
     q = q * scale

+    q = rearrange(q, 'b (h g) ... -> b h g ...', g = g)
+
     # block causal diagonal

-    sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
-    causal_mask = torch.ones((seq_len, seq_len), device = device, dtype = torch.bool).triu(1)
+    sim = einsum(q, k, 'b h g w i d, b h w j d -> b h g w i j')
+    causal_mask = torch.ones((block_size, block_size), device = device, dtype = torch.bool).triu(1)
     sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)

     # rest of the indices
@@ -37,48 +45,45 @@ def regular_attend(
     has_sel_kv_blocks = num_sel_kv_blocks > 0

     if has_sel_kv_blocks:
-        bk, bv = tuple(rearrange(t, 'b (h w) n d -> b h w n d', h = kv_heads) for t in (k, v))
+        bk, bv = k, v
         sel_bk = einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', bk, indices)
         sel_bv = einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', bv, indices)

-        q = rearrange(q, 'b (h w) n d -> b h (w n) d', h = q_heads)
-        bsim = einsum(q, sel_bk, 'b h i d, b h i j d -> b h i j')
+        q = rearrange(q, 'b h g w n d -> b h g (w n) d')
+        bsim = einsum(q, sel_bk, 'b h g i d, b h i j d -> b h g i j')

-        bsim = rearrange(bsim, 'b h (w i) (sel j) -> b h w i sel j', sel = num_sel_kv_blocks, i = fine_block_size)
+        bsim = rearrange(bsim, 'b h g (w i) (sel j) -> b h g w i sel j', sel = num_sel_kv_blocks, i = fine_block_size)

-        mask = rearrange(mask, 'b h (w i) sel -> b h w i sel', i = fine_block_size)
+        mask = rearrange(mask, 'b h (w i) sel -> b h 1 w i sel', i = fine_block_size)
         bsim = torch.where(mask[..., None], bsim, -torch.finfo(bsim.dtype).max)

-        sim = rearrange(sim, 'b (h w) i j -> b h w i 1 j', h = q_heads)
+        sim = rearrange(sim, 'b h g w i j -> b h g w i 1 j')

         sim = torch.cat((sim, bsim), dim = -2)
-        sim = rearrange(sim, 'b h w i causal_and_sel j -> b h (w i) (causal_and_sel j)')
+        sim = rearrange(sim, 'b h g w i causal_and_sel j -> b h g w i (causal_and_sel j)')

         sel_bv = rearrange(sel_bv, 'b h (w i) j d -> b h w i j d', i = fine_block_size)

-        v = repeat(v, 'b (h w) j d -> b h w i j d', h = kv_heads, i = fine_block_size)
+        v = repeat(v, 'b h w j d -> b h w i j d', i = fine_block_size)
         v = torch.cat((v, sel_bv), dim = -2)
-        v = rearrange(v, 'b h w i j d -> b h (w i) j d')
+        v = rearrange(v, 'b h w i j d -> b h w i j d')

     # attend

     attn = sim.softmax(dim = -1)

     if has_sel_kv_blocks:
-        out = einsum(attn, v, 'b h i j, b h i j d -> b h i d')
+        out = einsum(attn, v, 'b h g w i j, b h w i j d -> b h g w i d')
     else:
-        out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
-
-    if exists(block_size):
-        out = rearrange(out, 'b (h w) n d -> b h (w n) d', w = w)
+        out = einsum(attn, v, 'b h g w i j, b h w j d -> b h g w i d')

-    return out
+    return rearrange(out, 'b h g w n d -> b (h g) (w n) d')

 # mock inputs

 fine_block_size = 16

-q = torch.randn(1, 2, 512, 64).cuda()
+q = torch.randn(1, 4, 512, 64).cuda()
 k = torch.randn(1, 2, 512, 64).cuda()
 v = torch.randn(1, 2, 512, 64).cuda()

@@ -97,7 +102,7 @@ def regular_attend(

 # triton nsa forwards and backwards

-nsa_out = native_sparse_attend(nq, nk, nv, fine_block_size, indices, mask, 1)
+nsa_out = native_sparse_attend(nq, nk, nv, fine_block_size, indices, mask)
 nsa_out.sum().backward()

 # asserts
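
# aside: a minimal sketch (not part of the commit) of the shape bookkeeping
# behind the new `g` / `w` axes, i.e. grouped query heads per kv head and
# sequence blocks, using the same mock sizes as the test; assumes only torch
# and einops

import torch
from einops import rearrange, einsum

b, q_heads, kv_heads, seq_len, dim, block_size = 1, 4, 2, 512, 64, 16

g = q_heads // kv_heads    # query-head groups per kv head -> 2
w = seq_len // block_size  # number of sequence blocks -> 32

q = torch.randn(b, q_heads, seq_len, dim)
k = torch.randn(b, kv_heads, seq_len, dim)

# block the sequence, then split query heads into (kv head, group)
q, k = (rearrange(t, 'b h (w n) d -> b h w n d', n = block_size) for t in (q, k))
q = rearrange(q, 'b (h g) ... -> b h g ...', g = g)

# each query group attends against the single kv head it shares, one
# block-diagonal chunk at a time, so the causal mask only needs to be
# block_size x block_size rather than seq_len x seq_len
sim = einsum(q, k, 'b h g w i d, b h w j d -> b h g w i j')
print(sim.shape)  # torch.Size([1, 2, 2, 32, 16, 16])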
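
# aside: a rough sketch of the per-position block gather that einx.get_at
# performs above, reusing the exact pattern from the diff; shapes follow the
# mock inputs (2 kv heads, 32 blocks of 16 tokens, assuming 2 selected blocks
# per query position)

import torch
import einx

b, h, w, n, d, seq_len, sel = 1, 2, 32, 16, 64, 512, 2

bk = torch.randn(b, h, w, n, d)                      # keys, already blocked
indices = torch.randint(0, w, (b, h, seq_len, sel))  # chosen block ids per position

# for every query position i, pull out its `sel` chosen key blocks and
# flatten them into one key axis of length sel * n
sel_bk = einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', bk, indices)
print(sel_bk.shape)  # torch.Size([1, 2, 512, 32, 64])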