Skip to content

Commit 728e584

Browse files
committed
Update default configs for 340M models
1 parent 7b099fc commit 728e584

File tree

3 files changed

+6
-6
lines changed

3 files changed

+6
-6
lines changed

configs/nsa_340M.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
"initializer_range": 0.006,
1010
"max_position_embeddings": 8192,
1111
"model_type": "nsa",
12-
"num_heads": 64,
13-
"num_kv_heads": 4,
12+
"num_heads": 32,
13+
"num_kv_heads": 2,
14+
"head_dim": 64,
1415
"block_size": 64,
1516
"block_counts": 16,
1617
"window_size": 512,
@@ -19,5 +20,4 @@
1920
"tie_word_embeddings": false,
2021
"use_cache": true,
2122
"vocab_size": 32000
22-
}
23-
23+
}

native_sparse_attention/configuration_nsa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def __init__(
1616
num_hidden_layers: int = 24,
1717
num_heads: int = 64,
1818
num_kv_heads: int = 4,
19-
head_dim: int = 64,
19+
head_dim: int = 32,
2020
qkv_bias: bool = False,
2121
block_size: int = 64,
2222
block_counts: Optional[int] = 16,

native_sparse_attention/modeling_nsa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def __init__(
7070
self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
7171
self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
7272
self.g_proj = nn.Linear(self.hidden_size, self.num_heads * 3, bias=False)
73-
self.o_proj = nn.Linear(self.kv_dim, self.hidden_size, bias=False)
73+
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
7474

7575
self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)
7676

0 commit comments

Comments (0)