You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: train_triton_nsa.py
+6-5Lines changed: 6 additions & 5 deletions
Original file line number
Diff line number
Diff line change
@@ -25,24 +25,25 @@
25
25
LEARNING_RATE=1e-4
26
26
VALIDATE_EVERY=100
27
27
PRIME_LENGTH=64
28
+
SHOULD_GENERATE=False
28
29
GENERATE_EVERY=500
29
30
GENERATE_LENGTH=512
30
31
SEQ_LEN=512
31
32
HEADS=8
32
-
KV_HEADS=8
33
+
KV_HEADS=4
33
34
34
35
USE_SPARSE_ATTN=True
35
36
USE_TRITON_NSA=True
36
-
USE_FLEX_FOR_FINE_SELECTION=False# will push flex a bit, won't be efficient as each layer needs sparsity dynamically generated, but may be enough just to compare to full attention before going all-in on triton kernels
37
-
QUERY_HEADS_SHARE_SELECTION=False# if set to False, each query head can look at a different segment of their corresponding key / value head in GQA
37
+
USE_FLEX_FOR_FINE_SELECTION=False# will push flex a bit, won't be efficient as each layer needs sparsity dynamically generated, but may be enough just to compare to full attention before going all-in on triton kernels
38
+
QUERY_HEADS_SHARE_SELECTION=True# if set to False, each query head can look at a different segment of their corresponding key / value head in GQA
0 commit comments