
Commit 446a033

expand key / value by the head group size for now, when query_heads_share_selection is turned on
1 parent 6bf3a1c commit 446a033

File tree

5 files changed, +27 −13 lines changed


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 1 addition & 0 deletions

@@ -443,6 +443,7 @@ def forward(
         gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = fine_num_grouped_queries)

         if self.use_triton_kernel and not disable_triton_kernel:
+
             from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend

             fmask = selected_importance_values > 1e-10
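The context lines around this one-line change already show the pattern the rest of the commit builds on: per-kv-head gates are broadcast to every query head in their group, and the selection importance scores are thresholded into a boolean fine mask. Below is a minimal standalone sketch of that behaviour; the shapes and tensor names are invented here, and only the two quoted lines come from the file.

```python
import torch
from einops import repeat

batch, kv_heads, fine_num_grouped_queries = 1, 2, 2
seq_len, num_sel = 8, 4

# assumed toy inputs: one gate per kv head, one importance score per selected block
gates = torch.rand(batch, kv_heads, seq_len)
selected_importance_values = torch.rand(batch, kv_heads * fine_num_grouped_queries, seq_len, num_sel)

# broadcast each kv head's gate to its group of query heads (quoted context line)
gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = fine_num_grouped_queries)

# selected blocks with (near) zero importance are masked out of fine attention (quoted context line)
fmask = selected_importance_values > 1e-10

assert gates.shape == (batch, kv_heads * fine_num_grouped_queries, seq_len)
assert fmask.shape == selected_importance_values.shape
```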

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 9 additions & 2 deletions

@@ -578,7 +578,6 @@ def backward_kernel_one_col_block(
     seqlen_k,
     seqlen_q_rounded,
     headdim,
-    ATOMIC_ADD: tl.constexpr,
     BLOCK_HEADDIM: tl.constexpr,
     EVEN_M: tl.constexpr,
     EVEN_N: tl.constexpr,

@@ -1053,7 +1052,6 @@ def backward_kernel(
         seqlen_k,
         seqlen_q_rounded,
         headdim,
-        ATOMIC_ADD = False,
         BLOCK_HEADDIM = BLOCK_HEADDIM,
         EVEN_M = EVEN_M,
         EVEN_N = EVEN_N,

@@ -1263,6 +1261,15 @@ def native_sparse_attend(
     return_lse = False
 ):
     seq_len = fq.shape[-2]
+    q_heads, kv_heads, sel_heads = fq.shape[1], fk.shape[1], selected_block_indices.shape[1]
+
+    assert divisible_by(q_heads, kv_heads)
+    assert sel_heads in (q_heads, kv_heads)
+
+    # query heads within each group to attend to different segments
+
+    if kv_heads != sel_heads:
+        fk, fv = tuple(repeat(t, 'b h ... -> b (h gh) ...', gh = q_heads // kv_heads) for t in (fk, fv))

     out, lse = _native_sparse_attend(
         fq, fk, fv,
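The last hunk is the heart of the commit: when the selection indices carry one head per query head (query_heads_share_selection turned off), the key / value tensors are expanded by the GQA group size before being handed to the kernel, so all head dimensions line up. A small self-contained sketch of that expansion follows; the shapes are invented, and only the head checks and the `repeat` pattern mirror the diff.

```python
import torch
from einops import repeat

def divisible_by(num, den):
    return (num % den) == 0

batch, q_heads, kv_heads, seq_len, dim = 1, 4, 2, 32, 16

fq = torch.randn(batch, q_heads, seq_len, dim)
fk = torch.randn(batch, kv_heads, seq_len, dim)
fv = torch.randn(batch, kv_heads, seq_len, dim)

# assume each query head made its own block selection (one index head per query head)
num_sel = 2
selected_block_indices = torch.zeros(batch, q_heads, seq_len, num_sel, dtype = torch.long)

sel_heads = selected_block_indices.shape[1]
assert divisible_by(q_heads, kv_heads)
assert sel_heads in (q_heads, kv_heads)

if kv_heads != sel_heads:
    # repeat every kv head `gh` times so k / v match the per-query-head selection
    fk, fv = tuple(repeat(t, 'b h ... -> b (h gh) ...', gh = q_heads // kv_heads) for t in (fk, fv))

assert fk.shape[1] == fv.shape[1] == sel_heads
```

The "for now" in the commit message suggests this materialized repeat is a stopgap until the kernel itself can handle per-query-head selection against unexpanded grouped key / value heads.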

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.52"
+version = "0.0.53"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

test_triton_nsa.py

Lines changed: 10 additions & 8 deletions

@@ -27,7 +27,6 @@ def regular_attend(
     assert divisible_by(q_heads, kv_heads)

     q, k, v = tuple(pad_to_multiple(t, block_size, dim = -2) for t in (q, k, v))
-    indices, mask = tuple(pad_to_multiple(t, block_size, dim = -2) for t in (indices, mask))

     g = q_heads // kv_heads # `g` stands for `g`roups of query heads per kv head


@@ -52,6 +51,8 @@ def regular_attend(
     has_sel_kv_blocks = num_sel_kv_blocks > 0

     if has_sel_kv_blocks:
+        indices, mask = tuple(pad_to_multiple(t, block_size, dim = -2) for t in (indices, mask))
+
         bk, bv = k, v
         sel_bk = einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', bk, indices)
         sel_bv = einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', bv, indices)

@@ -99,18 +100,19 @@ def regular_attend(

 # mock inputs

+batch = 2
 seq_len = 511
 q_heads = 4
-kv_heads = 2
+kv_heads = 4
 fine_block_size = 16
-num_sel = 1
+num_sel = 2

-q = torch.randn(2, q_heads, seq_len, 64).cuda()
-k = torch.randn(2, kv_heads, seq_len, 64).cuda()
-v = torch.randn(2, kv_heads, seq_len, 64).cuda()
+q = torch.randn(batch, q_heads, seq_len, 64).cuda()
+k = torch.randn(batch, kv_heads, seq_len, 64).cuda()
+v = torch.randn(batch, kv_heads, seq_len, 64).cuda()

-indices = torch.zeros(2, kv_heads, seq_len, num_sel).long().cuda()
-mask = torch.randint(0, 2, (2, kv_heads, seq_len, num_sel)).bool().cuda()
+indices = torch.zeros(batch, kv_heads, seq_len, num_sel).long().cuda()
+mask = torch.randint(0, 2, (batch, kv_heads, seq_len, num_sel)).bool().cuda()

 # both regular and nsa pathways `r` and `n`
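In the reference path used by this test, the selected key / value blocks are gathered per query position with `einx.get_at('b h [w] n d, b h i sel -> b h i (sel n) d', ...)`. As a rough plain-PyTorch illustration of what that gather computes (toy shapes, not the test's code):

```python
import torch

b, h, w, n, d = 1, 2, 4, 16, 8    # batch, heads, kv blocks, block size, head dim
i, sel = 32, 2                    # query positions, selected blocks per position

bk = torch.randn(b, h, w, n, d)                  # keys grouped into blocks
indices = torch.randint(0, w, (b, h, i, sel))    # block ids chosen per query position

# out[b, h, i, s] = bk[b, h, indices[b, h, i, s]], then flatten the (sel, n) axes
gather_idx = indices[..., None, None].expand(b, h, i, sel, n, d)
bk_per_query = bk[:, :, None].expand(b, h, i, w, n, d)
sel_bk = bk_per_query.gather(3, gather_idx)      # (b, h, i, sel, n, d)
sel_bk = sel_bk.reshape(b, h, i, sel * n, d)     # matches einx's '(sel n)' output axis
```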

train.py

Lines changed: 6 additions & 2 deletions

@@ -34,7 +34,7 @@
 USE_SPARSE_ATTN = True
 USE_TRITON_NSA = True
 USE_FLEX_FOR_FINE_SELECTION = False # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels
-QUERY_HEADS_SHARE_SELECTION = False # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA
+QUERY_HEADS_SHARE_SELECTION = True # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA

 # sparse attention related


@@ -99,7 +99,11 @@ def base_decoding(
     sample_num_times = max(0, seq_len - prompt_seq_len)

     for _ in tqdm(range(sample_num_times)):
-        logits = net(out, disable_flex = True)
+        logits = net(
+            out,
+            disable_flex = True,
+            disable_triton_kernel = True
+        )

         logits = logits[:, -1]
         logits = top_k(logits, thres = filter_thres)
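Decoding then filters the last-step logits with `top_k(logits, thres = filter_thres)` before sampling. One common way such a threshold-style top-k filter is written (an assumption about the helper, not necessarily this repo's exact code):

```python
import torch

def top_k(logits: torch.Tensor, thres: float = 0.9) -> torch.Tensor:
    # keep the top (1 - thres) fraction of the vocabulary, mask the rest with -inf
    k = max(1, int((1 - thres) * logits.shape[-1]))
    vals, idx = logits.topk(k, dim = -1)
    filtered = torch.full_like(logits, float('-inf'))
    filtered.scatter_(-1, idx, vals)
    return filtered

logits = torch.randn(2, 256)          # (batch, vocab)
probs = top_k(logits, thres = 0.95).softmax(dim = -1)
token = torch.multinomial(probs, 1)   # sample from the filtered distribution
```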
