
Commit e9476ec

ready to be compared with full attention.

1 parent: f32ed38

4 files changed: +31 −14 lines

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 21 additions & 9 deletions
@@ -189,6 +189,7 @@ def __init__(
         norm = True,
         use_diff_topk = False,
         interpolated_importance_score = False,
+        query_heads_share_selected_kv = True, # if set to True, importance score is averaged across query heads to select top-n buckets of kv per kv head - but can be set to False for each query head within a group to look at different sets of kv buckets. will be more memory and compute of course
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,
         strategy_combine_mlp: Module | None = None
@@ -272,6 +273,8 @@ def __init__(

         self.interpolated_importance_score = interpolated_importance_score # in the case fine block size < compressed block size, will weigh space better when selecting

+        self.query_heads_share_selected_kv = query_heads_share_selected_kv
+
         self.selection_block_size = selection_block_size

         assert num_selected_blocks > 0
@@ -363,9 +366,14 @@ def forward(

         importance_scores = cattn[..., num_mem_compress_kv:]

-        # for gqa, we will average the compressed attention across each grouped queries (per key / values)
+        # maybe average the compressed attention across each grouped queries (per key / values)
+
+        if self.query_heads_share_selected_kv:
+            importance_scores = reduce(importance_scores, 'b (h grouped_queries) ... -> b h ...', 'mean', grouped_queries = self.num_grouped_queries)

-        importance_scores = reduce(importance_scores, 'b (h grouped_queries) ... -> b h ...', 'mean', grouped_queries = self.num_grouped_queries)
+            fine_num_grouped_queries = self.num_grouped_queries
+        else:
+            fine_num_grouped_queries = 1

         # handle if compress block size does not equal to the fine block size
         # cannot parse their equation, so will just improvise
@@ -400,12 +408,12 @@ def forward(
         if self.use_diff_topk:
             gates = straight_through(selected_importance_values, 1.)
             gates = gates.cumprod(dim = -1)[..., -1]
-            gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = self.num_grouped_queries)
+            gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = fine_num_grouped_queries)

         if exists(fine_selection_flex_mask):
             # flex attention for the selection for fine attention

-            fine_block_mask = fine_selection_flex_mask(selected_block_indices, num_grouped_queries = self.num_grouped_queries)
+            fine_block_mask = fine_selection_flex_mask(selected_block_indices, num_grouped_queries = fine_num_grouped_queries)

             fine_attn_out = flex_attention(fq, fk, fv, block_mask = fine_block_mask, enable_gqa = True)

@@ -428,13 +436,13 @@ def forward(
            # handle block causal diagonal in the diagram, but run experiments without to see

            fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
-           fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = self.kv_heads)
+           fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = selected_block_indices.shape[1])
            selected_block_indices = cat((selected_block_indices, fine_window_seq), dim = -1) # for the block causal diagonal in fig2

            fmask = repeat(fmask, 'b h i w -> b h i w j', j = self.selection_block_size)

            causal_mask = torch.ones((self.selection_block_size,) * 2, device = device, dtype = torch.bool).tril()
-           causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = self.kv_heads)
+           causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = fmask.shape[1])

            fmask = cat((fmask, causal_mask), dim = -2)
            fmask = rearrange(fmask, 'b h i w j -> b h i (w j)')
@@ -446,8 +454,12 @@ def forward(

            # get_at("b h [w] j d, b h i selected -> b h i selected j d", fkv, selected_block_indices)

-           fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
-           fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+           if self.query_heads_share_selected_kv:
+               fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+               fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+           else:
+               fk = repeat(fk, 'b h w j d -> b (h qh) i w j d', i = selected_block_indices.shape[2], qh = self.num_grouped_queries)
+               fv = repeat(fv, 'b h w j d -> b (h qh) i w j d', i = selected_block_indices.shape[2], qh = self.num_grouped_queries)

            selected_block_indices = repeat(selected_block_indices, 'b h i sel -> b h i sel j d', j = fk.shape[-2], d = fk.shape[-1])

@@ -460,7 +472,7 @@ def forward(

            fmask = rearrange(fmask, 'b h ... -> b h 1 ...')

-           fq = rearrange(fq, 'b (h qh) ... -> b h qh ...', qh = self.num_grouped_queries)
+           fq = rearrange(fq, 'b (h qh) ... -> b h qh ...', qh = fine_num_grouped_queries)

            fsim = einsum(fq, fk, 'b h qh i d, b h i j d -> b h qh i j') * self.scale
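To make the change above concrete, here is a small standalone sketch (not part of the commit, with made-up shapes) of what query_heads_share_selected_kv toggles during block selection: when True, the importance scores of the grouped query heads are mean-pooled per kv head before the top-n pick, so every query head in a group attends to the same selected blocks; when False, each query head keeps its own scores and its own selection, which is why the fine keys / values later need the extra repeat across query heads (more memory and compute, as the new keyword argument's comment notes).

import torch
from einops import reduce

# toy sizes, purely illustrative
batch, kv_heads, grouped_queries = 2, 4, 2
seq_len, num_blocks, num_selected = 64, 8, 2

# importance scores per individual query head: (b, kv_heads * grouped_queries, n, num_blocks)
importance_scores = torch.randn(batch, kv_heads * grouped_queries, seq_len, num_blocks)

query_heads_share_selected_kv = True

if query_heads_share_selected_kv:
    # mean-pool over the query heads belonging to each kv head,
    # so the whole group shares one top-n block selection per kv head
    importance_scores = reduce(
        importance_scores,
        'b (h grouped_queries) n w -> b h n w',
        'mean',
        grouped_queries = grouped_queries
    )

selected_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)

print(selected_block_indices.shape)
# shared selection:     torch.Size([2, 4, 64, 2])  -> one pick per kv head
# per-query-head pick:  torch.Size([2, 8, 64, 2])  -> one pick per query head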

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.40"
+version = "0.0.41"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_sparse_attn.py

Lines changed: 5 additions & 2 deletions
@@ -10,11 +10,13 @@
 @pytest.mark.parametrize('seq_len', (1, 4, 31, 32, 120))
 @pytest.mark.parametrize('kv_heads', (8, 4))
 @pytest.mark.parametrize('selection_block_size', (8, 4, 2))
+@pytest.mark.parametrize('query_heads_share_selected_kv', (False, True))
 def test_sparse_attn(
     use_diff_topk,
     seq_len,
     kv_heads,
-    selection_block_size
+    selection_block_size,
+    query_heads_share_selected_kv
 ):
     attn = SparseAttention(
         dim = 512,
@@ -25,7 +27,8 @@ def test_sparse_attn(
         compress_block_size = 4,
         selection_block_size = selection_block_size,
         num_selected_blocks = 2,
-        use_diff_topk = use_diff_topk
+        use_diff_topk = use_diff_topk,
+        query_heads_share_selected_kv = query_heads_share_selected_kv
     )

     tokens = torch.randn(2, seq_len, 512)
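For reference, a hedged end-to-end usage sketch with the new flag turned off. The import path and the constructor arguments not visible in the diffs above (dim_head, heads, sliding_window_size) are assumptions and may not match the actual signature.

import torch
from native_sparse_attention_pytorch import SparseAttention  # assumed import path

attn = SparseAttention(
    dim = 512,
    dim_head = 64,             # assumption, not shown in this commit
    heads = 8,                 # assumption, not shown in this commit
    kv_heads = 4,              # GQA: 2 query heads per kv head
    sliding_window_size = 2,   # assumption, not shown in this commit
    compress_block_size = 4,
    selection_block_size = 4,
    num_selected_blocks = 2,
    query_heads_share_selected_kv = False  # each query head selects its own kv blocks
)

tokens = torch.randn(2, 31, 512)
out = attn(tokens)
assert out.shape == tokens.shape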

train.py

Lines changed: 4 additions & 2 deletions
@@ -30,7 +30,8 @@
 SEQ_LEN = 256

 USE_SPARSE_ATTN = True
-USE_FLEX_FOR_FINE_SELECTION = True # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels
+USE_FLEX_FOR_FINE_SELECTION = True # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels
+QUERY_HEADS_SHARE_SELECTION = False # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA

 # experiment related

@@ -117,7 +118,8 @@ def base_decoding(
         selection_block_size = 32,
         num_selected_blocks = 2,
         use_diff_topk = True,
-        interpolated_importance_score = True
+        interpolated_importance_score = True,
+        query_heads_share_selected_kv = QUERY_HEADS_SHARE_SELECTION
     )
 ).cuda()
