
Commit 6ee9078

wire up flex fine selected attention and make sure it runs
1 parent 115279f commit 6ee9078


4 files changed: +57 −47 lines changed


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 54 additions & 44 deletions
@@ -79,13 +79,12 @@ def fine_mask(b_idx, h_idx, q_idx, kv_idx):
         compressed_q_idx = q_idx // fine_block_size
         compressed_kv_idx = kv_idx // fine_block_size

-        block_causal_mask = compressed_q_idx > compressed_kv_idx
         is_selected = one_hot_selected_block_indices[b_idx, h_idx, q_idx, compressed_kv_idx]

         causal_mask = q_idx >= kv_idx
         block_diagonal = compressed_q_idx == compressed_kv_idx

-        return (causal_mask & block_diagonal) | (block_causal_mask & is_selected)
+        return (causal_mask & (block_diagonal | is_selected))

     block_mask = create_block_mask(fine_mask, B = batch, H = heads, Q_LEN = seq_len, KV_LEN = seq_len, _compile = True)
     return block_mask
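
The simplification above is behavior-preserving: when a selected block lies strictly in the past, block-level causality already implies token-level causality, so the separate block_causal_mask term is redundant. A minimal standalone check of that equivalence (toy sizes, plain tensors, not the library's code):

# brute-force the old and new fine-mask predicates over a toy sequence
import torch

fine_block_size = 4
seq_len = 16
num_blocks = seq_len // fine_block_size

q_idx = torch.arange(seq_len).view(-1, 1)
kv_idx = torch.arange(seq_len).view(1, -1)

compressed_q_idx = q_idx // fine_block_size
compressed_kv_idx = kv_idx // fine_block_size

# random per-query block selection, standing in for one_hot_selected_block_indices
selected = torch.rand(seq_len, num_blocks) > 0.5
is_selected = selected[torch.arange(seq_len).view(-1, 1), compressed_kv_idx]

causal_mask = q_idx >= kv_idx
block_diagonal = compressed_q_idx == compressed_kv_idx
block_causal_mask = compressed_q_idx > compressed_kv_idx

old = (causal_mask & block_diagonal) | (block_causal_mask & is_selected)
new = causal_mask & (block_diagonal | is_selected)

assert torch.equal(old, new)
print('old and new fine mask predicates agree')
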
@@ -344,76 +343,87 @@ def forward(
             selected_importance_values, selected_block_indices = importance_scores.topk(num_selected, dim = -1)

             if self.use_diff_topk:
+                assert not exists(fine_selection_flex_mask)
                 gates = straight_through(selected_importance_values, 1.)

-            fmask = selected_importance_values > 1e-10
+            if exists(fine_selection_flex_mask):
+                # flex attention for the selection for fine attention

-            if seq_len < fine_divisible_seq_len:
-                remainder = fine_divisible_seq_len - seq_len
-                fk = pad_at_dim(fk, (0, remainder), value = 0., dim = -2)
-                fv = pad_at_dim(fv, (0, remainder), value = 0., dim = -2)
-                fq = pad_at_dim(fq, (0, remainder), value = 0., dim = -2)
+                fk, fv, selected_block_indices = tuple(repeat(t, 'b h ... -> b (num_grouped_queries h) ...', num_grouped_queries = self.num_grouped_queries) for t in (fk, fv, selected_block_indices))

-                fmask = pad_at_dim(fmask, (0, remainder), value = False, dim = -2)
+                fine_block_mask = fine_selection_flex_mask(selected_block_indices)

-                selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)
+                fine_attn_out = flex_attention(fq, fk, fv, block_mask = fine_block_mask)

-                if self.use_diff_topk:
-                    gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)
+            else:
+                fmask = selected_importance_values > 1e-10

-            # handle block causal diagonal in the diagram, but run experiments without to see
+                if seq_len < fine_divisible_seq_len:
+                    remainder = fine_divisible_seq_len - seq_len
+                    fk = pad_at_dim(fk, (0, remainder), value = 0., dim = -2)
+                    fv = pad_at_dim(fv, (0, remainder), value = 0., dim = -2)
+                    fq = pad_at_dim(fq, (0, remainder), value = 0., dim = -2)

-            fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
-            fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = self.kv_heads)
-            selected_block_indices = cat((selected_block_indices, fine_window_seq), dim = -1) # for the block causal diagonal in fig2
+                    fmask = pad_at_dim(fmask, (0, remainder), value = False, dim = -2)

-            fmask = repeat(fmask, 'b h i w -> b h i w j', j = self.selection_block_size)
+                    selected_block_indices = pad_at_dim(selected_block_indices, (0, remainder), value = 0, dim = -2)

-            causal_mask = torch.ones((self.selection_block_size,) * 2, device = device, dtype = torch.bool).tril()
-            causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = self.kv_heads)
+                    if self.use_diff_topk:
+                        gates = pad_at_dim(gates, (0, remainder), value = 1., dim = -2)

-            fmask = cat((fmask, causal_mask), dim = -2)
-            fmask = rearrange(fmask, 'b h i w j -> b h i (w j)')
+                # handle block causal diagonal in the diagram, but run experiments without to see

-            # select out the spatial crops of keys / values for fine attention
+                fine_window_seq = arange(fine_divisible_seq_len, device = device) // self.selection_block_size
+                fine_window_seq = repeat(fine_window_seq, 'n -> b h n 1', b = batch, h = self.kv_heads)
+                selected_block_indices = cat((selected_block_indices, fine_window_seq), dim = -1) # for the block causal diagonal in fig2

-            fk = rearrange(fk, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
-            fv = rearrange(fv, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
+                fmask = repeat(fmask, 'b h i w -> b h i w j', j = self.selection_block_size)

-            # get_at("b h [w] j d, b h i selected -> b h i selected j d", fkv, selected_block_indices)
+                causal_mask = torch.ones((self.selection_block_size,) * 2, device = device, dtype = torch.bool).tril()
+                causal_mask = repeat(causal_mask, 'i j -> b h (w i) 1 j', w = num_fine_blocks, b = batch, h = self.kv_heads)

-            fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
-            fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+                fmask = cat((fmask, causal_mask), dim = -2)
+                fmask = rearrange(fmask, 'b h i w j -> b h i (w j)')

-            selected_block_indices = repeat(selected_block_indices, 'b h i sel -> b h i sel j d', j = fk.shape[-2], d = fk.shape[-1])
+                # select out the spatial crops of keys / values for fine attention

-            fk = fk.gather(3, selected_block_indices)
-            fv = fv.gather(3, selected_block_indices)
+                fk = rearrange(fk, 'b h (w n) d -> b h w n d', w = num_fine_blocks)
+                fv = rearrange(fv, 'b h (w n) d -> b h w n d', w = num_fine_blocks)

-            # handle maybe gating
+                # get_at("b h [w] j d, b h i selected -> b h i selected j d", fkv, selected_block_indices)

-            if self.use_diff_topk:
-                gates = F.pad(gates, (0, 1), value = 1.)
+                fk = repeat(fk, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])
+                fv = repeat(fv, 'b h w j d -> b h i w j d', i = selected_block_indices.shape[2])

-                fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
-                fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)
+                selected_block_indices = repeat(selected_block_indices, 'b h i sel -> b h i sel j d', j = fk.shape[-2], d = fk.shape[-1])

-            fk = rearrange(fk, 'b h i w j d -> b h i (w j) d')
-            fv = rearrange(fv, 'b h i w j d -> b h i (w j) d')
+                fk = fk.gather(3, selected_block_indices)
+                fv = fv.gather(3, selected_block_indices)

-            # fine attention
+                # handle maybe gating

-            fk, fv, fmask = tuple(repeat(t, 'b h ... -> b (num_grouped_queries h) ...', num_grouped_queries = self.num_grouped_queries) for t in (fk, fv, fmask))
+                if self.use_diff_topk:
+                    gates = F.pad(gates, (0, 1), value = 1.)

-            fsim = einsum(fq, fk, 'b h i d, b h i j d -> b h i j') * self.scale
+                    fk = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fk)
+                    fv = einx.multiply('b h i w, b h i w j d -> b h i w j d', gates, fv)

-            fsim = fsim.masked_fill(~fmask, mask_value)
+                fk = rearrange(fk, 'b h i w j d -> b h i (w j) d')
+                fv = rearrange(fv, 'b h i w j d -> b h i (w j) d')

-            fattn = fsim.softmax(dim = -1)
+                # fine attention
+
+                fk, fv, fmask = tuple(repeat(t, 'b h ... -> b (num_grouped_queries h) ...', num_grouped_queries = self.num_grouped_queries) for t in (fk, fv, fmask))
+
+                fsim = einsum(fq, fk, 'b h i d, b h i j d -> b h i j') * self.scale
+
+                fsim = fsim.masked_fill(~fmask, mask_value)
+
+                fattn = fsim.softmax(dim = -1)

-            fine_attn_out = einsum(fattn, fv, 'b h i j, b h i j d -> b h i d')
+                fine_attn_out = einsum(fattn, fv, 'b h i j, b h i j d -> b h i d')

-            fine_attn_out = fine_attn_out[..., :seq_len, :]
+                fine_attn_out = fine_attn_out[..., :seq_len, :]
         else:
             # if only first block, just do a simple block causal
native_sparse_attention_pytorch/transformer.py

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ def forward(
             )

         if not disable_flex and self.use_flex_fine_selection:
-            attn_kwargs.udpate(
+            attn_kwargs.update(
                 fine_selection_flex_mask = create_fine_mask(seq_len, self.attn_fine_block_size)
             )


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.28"
+version = "0.0.29"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

train.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 SEQ_LEN = 256

 USE_SPARSE_ATTN = True
-USE_FLEX_FOR_FINE_SELECTION = False # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels
+USE_FLEX_FOR_FINE_SELECTION = True # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels

 # experiment related
