
Commit 10bcfb9

last commit for the day, should be ready for experiments tomorrow
1 parent fd0c756 commit 10bcfb9

4 files changed (+36, −22 lines)


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 21 additions & 17 deletions
@@ -4,7 +4,7 @@
 from math import ceil

 import torch
-from torch import nn, arange, stack, cat
+from torch import nn, arange, stack, cat, Tensor
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList

@@ -65,29 +65,32 @@ def compress_mask(_, __, q_idx, kv_idx):
     block_mask = create_block_mask(compress_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = kv_seq_len, _compile = True)
     return block_mask

+def create_fine_mask(seq_len, fine_block_size):

-def create_fine_mask(selected_block_indices: Tensor, seq_len, fine_block_size):
-    device = selected_block_indices.device
-    batch, heads = selected_block_indices.shape[:2]
+    def inner(selected_block_indices: Tensor):
+        device = selected_block_indices.device
+        batch, heads = selected_block_indices.shape[:2]

-    one_hot_selected_block_indices = torch.zeros((*selected_block_indices.shape[:-1], seq_len // fine_block_size), device = device, dtype = torch.bool)
-    one_hot_selected_block_indices.scatter_(-1, selected_block_indices, True)
+        one_hot_selected_block_indices = torch.zeros((*selected_block_indices.shape[:-1], seq_len // fine_block_size), device = device, dtype = torch.bool)
+        one_hot_selected_block_indices.scatter_(-1, selected_block_indices, True)

-    def fine_mask(b_idx, h_idx, q_idx, kv_idx):
+        def fine_mask(b_idx, h_idx, q_idx, kv_idx):

-        compressed_q_idx = q_idx // fine_block_size
-        compressed_kv_idx = kv_idx // fine_block_size
+            compressed_q_idx = q_idx // fine_block_size
+            compressed_kv_idx = kv_idx // fine_block_size

-        block_causal_mask = compressed_q_idx > compressed_kv_idx
-        is_selected = one_hot_selected_block_indices[b_idx, h_idx, q_idx, compressed_kv_idx]
+            block_causal_mask = compressed_q_idx > compressed_kv_idx
+            is_selected = one_hot_selected_block_indices[b_idx, h_idx, q_idx, compressed_kv_idx]

-        causal_mask = q_idx >= kv_idx
-        block_diagonal = compressed_q_idx == compressed_kv_idx
+            causal_mask = q_idx >= kv_idx
+            block_diagonal = compressed_q_idx == compressed_kv_idx

-        return (causal_mask & block_diagonal) | (block_causal_mask & is_selected)
+            return (causal_mask & block_diagonal) | (block_causal_mask & is_selected)

-    block_mask = create_block_mask(fine_mask, B = batch, H = heads, Q_LEN = seq_len, KV_LEN = seq_len, _compile = True)
-    return block_mask
+        block_mask = create_block_mask(fine_mask, B = batch, H = heads, Q_LEN = seq_len, KV_LEN = seq_len, _compile = True)
+        return block_mask
+
+    return inner

 # helpers

@@ -241,7 +244,8 @@ def __init__(
     def forward(
         self,
         inp,
-        sliding_window_flex_mask = None
+        sliding_window_flex_mask = None,
+        fine_selection_flex_mask = None
     ):
         batch, seq_len, scale, heads, device = *inp.shape[:2], self.scale, self.heads, inp.device
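The notable change here is that create_fine_mask is now curried: the outer call binds the static sequence length and fine block size, while the returned inner closure builds the flex attention BlockMask only once the per-head selected block indices exist. A minimal usage sketch, with illustrative shapes and assuming a CUDA device on a PyTorch build that ships flex attention (create_block_mask above compiles the mask on the GPU):

import torch
from native_sparse_attention_pytorch.native_sparse_attention import create_fine_mask

seq_len, fine_block_size = 256, 16
batch, heads, num_selected = 1, 8, 2

# bind the static sizes once, e.g. at the top of the transformer forward
fine_mask_fn = create_fine_mask(seq_len, fine_block_size)

# per-query selected key/value blocks, shaped (batch, heads, seq_len, num_selected);
# random indices stand in for the ones an attention layer would actually compute
selected_block_indices = torch.randint(
    0, seq_len // fine_block_size,
    (batch, heads, seq_len, num_selected),
    device = 'cuda'
)

# returns a flex attention BlockMask, ready to hand to flex_attention
fine_block_mask = fine_mask_fn(selected_block_indices)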

native_sparse_attention_pytorch/transformer.py

Lines changed: 12 additions & 4 deletions
@@ -10,8 +10,9 @@

 from native_sparse_attention_pytorch.native_sparse_attention import (
     SparseAttention,
-    create_sliding_mask,
     create_compress_mask,
+    create_fine_mask,
+    create_sliding_mask,
 )

 # flex attention

@@ -121,6 +122,7 @@ def __init__(
         ff_expansion_factor = 4.,
         use_sparse_attn = False,
         use_flex_sliding_window = False,
+        use_flex_fine_selection = False,
         sparse_attn_kwargs: dict = dict(
             sliding_window_size = 32,
             compress_block_size = 4,

@@ -131,11 +133,12 @@ def __init__(
         super().__init__()
         self.token_emb = nn.Embedding(num_tokens, dim)

-        if use_flex_sliding_window:
+        if use_flex_sliding_window or use_flex_fine_selection:
             assert exists(flex_attention), 'flex attention is not available on your current version of pytorch'

         self.use_sparse_attn = use_sparse_attn
-        self.use_flex_sliding_window = use_flex_sliding_window
+        self.use_flex_sliding_window = use_sparse_attn & use_flex_sliding_window
+        self.use_flex_fine_selection = use_sparse_attn & use_flex_fine_selection

         layers = []
         for _ in range(depth):

@@ -186,11 +189,16 @@ def forward(

         attn_kwargs = dict()

-        if not disable_flex and self.use_sparse_attn and self.use_flex_sliding_window:
+        if not disable_flex and self.use_flex_sliding_window:
             attn_kwargs.update(
                 sliding_window_flex_mask = create_sliding_mask(seq_len, self.attn_sliding_window_size)
             )

+        if not disable_flex and self.use_flex_fine_selection:
+            attn_kwargs.update(
+                fine_selection_flex_mask = create_fine_mask(seq_len, self.attn_fine_block_size)
+            )
+
         # layers

         for attn, ff in self.layers:
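Because create_fine_mask is curried, what gets passed down as fine_selection_flex_mask here is a callable rather than a ready-made BlockMask, presumably so each attention layer can invoke it once its own selected block indices are computed. For orientation, a hedged construction sketch; the class name Transformer and the non-sparse hyperparameters are assumptions pieced together from this diff and train.py, not a verified API:

from native_sparse_attention_pytorch.transformer import Transformer

model = Transformer(
    num_tokens = 256,
    dim = 512,
    depth = 6,
    use_sparse_attn = True,
    use_flex_sliding_window = True,    # sliding window flex mask built once per forward
    use_flex_fine_selection = True,    # curried fine mask creator handed to every layer
    sparse_attn_kwargs = dict(
        sliding_window_size = 32,
        compress_block_size = 32,
    )
)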

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.24"
+version = "0.0.25"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

train.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 SEQ_LEN = 256

 USE_SPARSE_ATTN = True
+USE_FLEX_FOR_FINE_SELECTION = False # will push flex a bit, won't be efficient, as each layer needs sparsity dynamically generated, but may be enough just to compare to full attention before going all-in on triton kernels

 # experiment related

@@ -97,6 +98,7 @@ def base_decoding(
     kv_heads = 4,
     use_sparse_attn = USE_SPARSE_ATTN,
     use_flex_sliding_window = True,
+    use_flex_fine_selection = USE_FLEX_FOR_FINE_SELECTION,
     sparse_attn_kwargs = dict(
         sliding_window_size = 32,
         compress_block_size = 32,
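For intuition about what the flex fine-selection path keeps relative to the full attention baseline the comment wants to compare against, here is a small CPU-only illustration of the mask rule from fine_mask above, rewritten with plain tensor ops; the sizes and the "every query selects block 0" choice are purely illustrative:

import torch

seq_len, fine_block_size = 16, 4

q_idx = torch.arange(seq_len)[:, None]    # query positions, as a column
kv_idx = torch.arange(seq_len)[None, :]   # key/value positions, as a row

compressed_q_idx = q_idx // fine_block_size
compressed_kv_idx = kv_idx // fine_block_size

# pretend every query selected key/value block 0
is_selected = (compressed_kv_idx == 0).expand(seq_len, seq_len)

# causal attention within a query's own fine block, plus any selected earlier block
block_diagonal = (q_idx >= kv_idx) & (compressed_q_idx == compressed_kv_idx)
selected_past = (compressed_q_idx > compressed_kv_idx) & is_selected
fine = block_diagonal | selected_past

full_causal = q_idx >= kv_idx

print(fine.int())
print('kept fraction vs full causal:', fine.sum().item() / full_causal.sum().item())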
