Skip to content

Commit 4ae5c11

Browse files
committed
feedback from researchers has been positive about tensor typing, so will use it here
1 parent db38d04 commit 4ae5c11

File tree

3 files changed

+43
-4
lines changed

3 files changed

+43
-4
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from torch import Tensor
2+
3+
from jaxtyping import (
4+
Float,
5+
Int,
6+
Bool
7+
)
8+
9+
# jaxtyping is a misnomer, works for pytorch
10+
11+
class TorchTyping:
    """Adapter giving jaxtyping annotations a torch-first subscript syntax.

    Wraps one of the jaxtyping abstract dtypes (``Float`` / ``Int`` / ``Bool``)
    so that ``Float['b n d']`` expands to ``Float[Tensor, 'b n d']`` — the
    array type is always ``torch.Tensor`` and only the shape string varies.
    """

    def __init__(self, abstract_dtype):
        # the jaxtyping annotation class being wrapped
        self.abstract_dtype = abstract_dtype

    def __getitem__(self, shapes: str):
        # delegate to the wrapped dtype, pinning the array type to torch.Tensor
        dtype = self.abstract_dtype
        return dtype[Tensor, shapes]
17+
18+
# shadow the jaxtyping dtypes with torch-bound wrappers:
# after this, Float['b n d'] means Float[Tensor, 'b n d']
Float = TorchTyping(Float)
Int = TorchTyping(Int)
Bool = TorchTyping(Bool)

# fix: __all__ must contain the *names* as strings, not the objects —
# listing the objects makes `from ... import *` raise
# "TypeError: Item in __all__ must be str"
__all__ = [
    'Float',
    'Int',
    'Bool'
]

native_sparse_attention_pytorch/triton_native_sparse_attention.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from __future__ import annotations
2+
from native_sparse_attention_pytorch.tensor_typing import Float, Int, Bool
23

34
# taken from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py
45
# with fixes for triton 2.3
@@ -1462,11 +1463,22 @@ def backward(self, ctx, do, _):
14621463

14631464
_native_sparse_attend = NSA.apply
14641465

1466+
# ein notation
1467+
1468+
# b - batch
1469+
# qh - query heads
1470+
# kh - key / value heads
1471+
# n - token sequence
1472+
# d - attention head dimension
1473+
# sel - selected indices
1474+
14651475
def native_sparse_attend(
1466-
fq, fk, fv,
1467-
block_size,
1468-
selected_block_indices,
1469-
fmask,
1476+
fq: Float['b qh n d'],
1477+
fk: Float['b kh n d'],
1478+
fv: Float['b kh n d'],
1479+
block_size: int,
1480+
selected_block_indices: Int['b qh sel'] | Int['b kh sel'],
1481+
fmask: Bool['b qh sel'] | Bool['b kh sel'],
14701482
return_lse = False
14711483
):
14721484
seq_len = fq.shape[-2]

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ classifiers=[
2525
dependencies = [
2626
"einx>=0.3.0",
2727
"einops>=0.8.1",
28+
"jaxtyping",
2829
"local-attention>=1.11.1",
2930
"rotary-embedding-torch",
3031
"torch>=2.5",

0 commit comments

Comments
 (0)