Commit 14a7f1c

Adds helper for top-k attention indices
Introduces reusable top-k extraction on the bias tensor to simplify downstream mask logic.
1 parent c4401e0 commit 14a7f1c

1 file changed: +27 −0 lines changed


flash_dmattn/utils/mask.py

Lines changed: 27 additions & 0 deletions
@@ -17,6 +17,33 @@
 import torch
 
 
+def topk_indices(
+    attention_bias: torch.Tensor,
+    window_size: int,
+    **kwargs,
+) -> torch.Tensor:
+    r"""
+    This function generates top-k indices based on the attention bias.
+
+    Args:
+        attention_bias (torch.Tensor): The attention bias tensor of shape
+            (batch_size, num_kv_heads, key_len).
+        window_size (int): The number of top elements to consider for the mask.
+        **kwargs: Additional keyword arguments.
+
+    Returns:
+        topk_indices (Tensor): The top-k indices tensor of shape
+            (batch_size, num_kv_heads, window_size).
+    """
+    attention_bias = attention_bias.detach()
+    topk_indices = torch.topk(
+        attention_bias,
+        window_size, dim=-1, largest=True, sorted=False
+    ).indices
+    topk_indices = torch.sort(topk_indices, dim=-1).values
+    return topk_indices
+
+
 def dynamic_mask(
     attention_bias: torch.Tensor,
     attention_mask: Optional[torch.Tensor],
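
A minimal usage sketch of the new helper (the import path follows the file touched in this commit; the tensor sizes are illustrative, not taken from the source):

import torch

from flash_dmattn.utils.mask import topk_indices

# Illustrative shapes: batch_size=2, num_kv_heads=4, key_len=128
attention_bias = torch.randn(2, 4, 128)

# Select the 32 highest-bias key positions per head; the helper returns their
# indices sorted in ascending order along the key dimension.
indices = topk_indices(attention_bias, window_size=32)
print(indices.shape)  # torch.Size([2, 4, 32])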
