Commit f32ed38

oops

1 parent cfa6e08 commit f32ed38

3 files changed: +3 -3 lines changed

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 1 addition & 1 deletion
@@ -399,7 +399,7 @@ def forward(
 
         if self.use_diff_topk:
             gates = straight_through(selected_importance_values, 1.)
-            gates = gates.cumsum(dim = -1)[..., -1]
+            gates = gates.cumprod(dim = -1)[..., -1]
             gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = self.num_grouped_queries)
 
         if exists(fine_selection_flex_mask):
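
For context on the one-line fix above: with use_diff_topk enabled, straight_through forces each selected-block gate to 1. in the forward pass while letting gradients reach the underlying importance values. Taking the last element of a cumsum over those gates yields k (the number of selected blocks), silently scaling the fine branch by k; the last element of a cumprod stays at 1. A minimal sketch, assuming straight_through is the usual straight-through-estimator helper (the name comes from the diff; the body here is an assumption):

import torch

def straight_through(t, target):
    # assumed helper: forward pass takes the value `target`,
    # backward pass routes gradients through `t`
    return t + (target - t).detach()

# toy importance values for k = 3 selected blocks
vals = torch.tensor([0.9, 0.7, 0.4], requires_grad = True)

gates = straight_through(vals, 1.)               # forward value: [1., 1., 1.]

gate_sum  = gates.cumsum(dim = -1)[..., -1]      # forward value: 3. -> fine branch scaled by k
gate_prod = gates.cumprod(dim = -1)[..., -1]     # forward value: 1. -> fine branch unscaled

gate_prod.backward()
print(vals.grad)                                 # tensor([1., 1., 1.]) -- gradient still flows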

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.39"
+version = "0.0.40"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

train.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def base_decoding(
         ),
         selection_block_size = 32,
         num_selected_blocks = 2,
-        use_diff_topk = False,
+        use_diff_topk = True,
         interpolated_importance_score = True
     )
 ).cuda()
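
Flipping use_diff_topk to True in the training script exercises the differentiable top-k path patched above (the `if self.use_diff_topk:` branch in native_sparse_attention.py), so the selected-block gates receive gradients during training rather than block selection being a hard, non-differentiable choice.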
