Skip to content

Commit 922a633

Browse files
committed
fix: importance scores were not normalized (softmax) before top-k selection at inference
1 parent c3aa352 commit 922a633

File tree

3 files changed

+9
-3
lines changed

3 files changed

+9
-3
lines changed

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,8 @@ def forward_inference(
496496
if self.query_heads_share_selected_kv:
497497
importance_scores = reduce(importance_scores, 'b (h grouped_queries) ... -> b h ...', 'mean', grouped_queries = self.num_grouped_queries)
498498

499+
importance_scores = importance_scores.softmax(dim = -1)
500+
499501
sel_scores, sel_indices = importance_scores.topk(num_selected, dim = -1)
500502

501503
fine_divisible_seq_len = round_up_mult(seq_len, self.selection_block_size)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "native-sparse-attention-pytorch"
3-
version = "0.1.19"
3+
version = "0.1.20"
44
description = "Native Sparse Attention"
55
authors = [
66
{ name = "Phil Wang", email = "[email protected]" }

tests/test_sparse_attn.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ def test_sparse_attn(
5151
assert tokens.shape == attended.shape
5252

5353
@pytest.mark.parametrize('seq_len', (2, 8, 16))
54-
def test_inference(seq_len):
54+
@pytest.mark.parametrize('num_selected_blocks', (0, 2))
55+
def test_inference(
56+
seq_len,
57+
num_selected_blocks
58+
):
5559

5660
attn = SparseAttention(
5761
dim = 512,
@@ -61,7 +65,7 @@ def test_inference(seq_len):
6165
sliding_window_size = 2,
6266
compress_block_size = 5,
6367
selection_block_size = 10,
64-
num_selected_blocks = 0
68+
num_selected_blocks = num_selected_blocks
6569
)
6670

6771
tokens = torch.randn(2, seq_len, 512)

0 commit comments

Comments (0)