Skip to content

Commit dfd3b15

Browse files
committed
fixes
1 parent 2cf4646 commit dfd3b15

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828
GENERATE_EVERY = 500
2929
GENERATE_LENGTH = 512
3030
SEQ_LEN = 512
31-
HEAD = 8
31+
HEADS = 8
3232
KV_HEADS = 4
3333

34-
USE_SPARSE_ATTN = False
34+
USE_SPARSE_ATTN = True
3535
USE_FLEX_FOR_FINE_SELECTION = True # will push flex a bit, won't be efficient as each layer needs sparsity dynamically generated, but may be enough just to compare to full attention before going all-in on triton kernels
3636
QUERY_HEADS_SHARE_SELECTION = False # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA
3737

@@ -50,7 +50,7 @@
5050

5151
PROJECT_NAME = 'native-sparse-attention'
5252
RUN_NAME = 'baseline' if not USE_SPARSE_ATTN else f'sparse-attn: compress size {COMPRESS_BLOCK_SIZE} | fine size {FINE_BLOCK_SIZE} | {NUM_FINE_SELECTED} selected'
53-
WANDB_ONLINE = True # turn this on to pipe experiment to cloud
53+
WANDB_ONLINE = False # turn this on to pipe experiment to cloud
5454

5555
# helpers
5656

0 commit comments

Comments
 (0)