Skip to content

Commit dfd3b15

Browse files
committed
fixes
1 parent 2cf4646 commit dfd3b15

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828
GENERATE_EVERY = 500
2929
GENERATE_LENGTH = 512
3030
SEQ_LEN = 512
31-
HEAD = 8
31+
HEADS = 8
3232
KV_HEADS = 4
3333

34-
USE_SPARSE_ATTN = False
34+
USE_SPARSE_ATTN = True
3535
USE_FLEX_FOR_FINE_SELECTION = True # will push flex a bit, won't be efficient as each layer needs sparsity dynamically generated, but may be enough just to compare to full attention before going all-in on triton kernels
3636
QUERY_HEADS_SHARE_SELECTION = False # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA
3737

@@ -50,7 +50,7 @@
5050

5151
PROJECT_NAME = 'native-sparse-attention'
5252
RUN_NAME = 'baseline' if not USE_SPARSE_ATTN else f'sparse-attn: compress size {COMPRESS_BLOCK_SIZE} | fine size {FINE_BLOCK_SIZE} | {NUM_FINE_SELECTED} selected'
53-
WANDB_ONLINE = True # turn this on to pipe experiment to cloud
53+
WANDB_ONLINE = False # turn this on to pipe experiment to cloud
5454

5555
# helpers
5656

0 commit comments

Comments
 (0)