Commit 1f11855

fix some padding issues for gating with importance score
1 parent 77e55aa commit 1f11855

File tree

1 file changed: +4 −0 lines


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 4 additions & 0 deletions
@@ -657,6 +657,9 @@ def forward(
 
         selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)
 
+        if exists(gates):
+            gates = pad_at_dim(gates, (0, remainder), value = 0, dim = -2)
+
         # handle block causal diagonal in the diagram, but run experiments without to see
 
         fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
@@ -693,6 +696,7 @@ def forward(
         # differential topk gating
 
         if self.use_diff_topk:
+            gates = F.pad(gates, (0, 1), value = 1.)
             fk = einx.multiply('b h i sel, b h i sel j d -> b h i sel j d', gates, fk)
 
         # merge selected key values
