
Commit f1b9c2d (parent e582786)

init to sliding window strategy only

File tree: 2 files changed (+7, -2)


native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 6 additions & 1 deletion

@@ -4,8 +4,8 @@
 from math import ceil
 
 import torch
-from torch import nn, arange, stack, cat, Tensor
 import torch.nn.functional as F
+from torch import nn, arange, stack, cat, tensor, Tensor
 from torch.nn import Module, ModuleList
 
 from local_attention import LocalAttention

@@ -226,6 +226,11 @@ def __init__(
        if not exists(strategy_combine_mlp):
            strategy_combine_mlp = nn.Linear(dim, 3 * heads)
 
+        # init to sliding windows first, as network tends to pick up on local patterns first before distant ones
+
+        nn.init.zeros_(strategy_combine_mlp.weight)
+        strategy_combine_mlp.bias.data.copy_(tensor([-2., -2., 2.] * heads))
+
        self.to_strategy_combine = nn.Sequential(
            strategy_combine_mlp,
            nn.Sigmoid(),
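
For readers skimming the diff, here is a minimal standalone sketch of what the new initialization does (the dim and heads values below are arbitrary, and the per-head gate ordering, with the sliding window gate last, is inferred from the commit message rather than shown in this diff). With the weights zeroed, the gate output at init is just sigmoid(bias) for every token, so each head starts out weighting its three attention branches at roughly 0.12 / 0.12 / 0.88 in favor of the sliding window branch:

# standalone sketch of the new init; dim and heads are hypothetical, and the
# assumption that the third gate per head is the sliding window branch comes
# from the commit message, not from this diff

import torch
from torch import nn, tensor

dim = 512
heads = 8

strategy_combine_mlp = nn.Linear(dim, 3 * heads)

# zero the weights so the gate output depends only on the bias at init
nn.init.zeros_(strategy_combine_mlp.weight)

# bias pattern [-2, -2, 2] per head -> after sigmoid, ~[0.12, 0.12, 0.88]
strategy_combine_mlp.bias.data.copy_(tensor([-2., -2., 2.] * heads))

to_strategy_combine = nn.Sequential(strategy_combine_mlp, nn.Sigmoid())

x = torch.randn(2, 1024, dim)            # (batch, seq, dim)

with torch.no_grad():
    gates = to_strategy_combine(x)       # (batch, seq, 3 * heads)

print(gates[0, 0, :3])                   # ~ tensor([0.1192, 0.1192, 0.8808])

Because the weights start at zero, the gate is uniform across tokens at init but remains fully learnable, so training can shift weight onto the other branches as the network picks up longer-range patterns.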

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "native-sparse-attention-pytorch"
-version = "0.0.26"
+version = "0.0.27"
 description = "Native Sparse Attention"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }
