
Commit 2eb7cf6

able to generate samples in triton train script
1 parent 4d34b8a commit 2eb7cf6

3 files changed: 13 additions & 6 deletions

native_sparse_attention_pytorch/native_sparse_attention.py

Lines changed: 2 additions & 1 deletion
@@ -318,6 +318,7 @@ def __init__(
     def forward(
         self,
         inp,
+        disable_triton_kernel = False,
         sliding_window_flex_mask = None,
         fine_selection_flex_mask = None
     ):
@@ -441,7 +442,7 @@ def forward(
         gates = gates.cumprod(dim = -1)[..., -1]
         gates = repeat(gates, 'b h ... -> b (h qh) ...', qh = fine_num_grouped_queries)

-        if self.use_triton_kernel:
+        if self.use_triton_kernel and not disable_triton_kernel:
             from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend

             fmask = selected_importance_values > 1e-10
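
For reference, a minimal usage sketch of the new argument (not part of this commit): the import path and constructor arguments below follow the repository README and are assumptions here; only the disable_triton_kernel keyword on forward comes from the diff above.

import torch
from native_sparse_attention_pytorch import SparseAttention

# constructor arguments are assumptions taken from the repository README
attn = SparseAttention(
    dim = 512,
    dim_head = 64,
    heads = 8,
    sliding_window_size = 2,
    compress_block_size = 4,
    selection_block_size = 4,
    num_selected_blocks = 2,
    use_triton_kernel = True
)

tokens = torch.randn(1, 512, 512)

# even with use_triton_kernel = True on the module, this call takes the
# plain pytorch path because of the new per-call flag
out = attn(tokens, disable_triton_kernel = True)

assert out.shape == tokens.shape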

native_sparse_attention_pytorch/transformer.py

Lines changed: 5 additions & 2 deletions
@@ -182,7 +182,8 @@ def forward(
         self,
         ids,
         return_loss = False,
-        disable_flex = False
+        disable_flex = False,
+        disable_triton_kernel = True
     ):
         if return_loss:
             ids, labels = ids[:, :-1], ids[:, 1:]
@@ -195,7 +196,9 @@ def forward(

         # prepare maybe flex attention masks

-        attn_kwargs = dict()
+        attn_kwargs = dict(
+            disable_triton_kernel = disable_triton_kernel
+        )

         if not disable_flex and self.use_flex_sliding_window:
             attn_kwargs.update(

train_triton_nsa.py

Lines changed: 6 additions & 3 deletions
@@ -25,7 +25,6 @@
 LEARNING_RATE = 1e-4
 VALIDATE_EVERY = 100
 PRIME_LENGTH = 64
-SHOULD_GENERATE = False
 GENERATE_EVERY = 500
 GENERATE_LENGTH = 512
 SEQ_LEN = 512
@@ -100,7 +99,11 @@ def base_decoding(
     sample_num_times = max(0, seq_len - prompt_seq_len)

     for _ in tqdm(range(sample_num_times)):
-        logits = net(out, disable_flex = True)
+        logits = net(
+            out,
+            disable_flex = True,
+            disable_triton_kernel = True
+        )

         logits = logits[:, -1]
         logits = top_k(logits, thres = filter_thres)
@@ -208,7 +211,7 @@ def __getitem__(self, index):
             wandb.log(dict(valid_loss = loss.item()), step = i)
             print(f"validation loss: {loss.item():.3f}")

-    if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
+    if i % GENERATE_EVERY == 0:
         model.eval()

         inp = random.choice(val_dataset)[:PRIME_LENGTH]
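
Putting the three changes together, generation in the train script now runs every GENERATE_EVERY steps and calls the model with both the flex masks and the triton kernel turned off. A minimal sketch of that decode step follows; the greedy argmax stands in for the script's top-k sampling and is an assumption, while the keyword names come from the diff above.

import torch

@torch.no_grad()
def next_token(net, out):
    # net is assumed to be the Transformer from this repo, out the running tensor of sampled ids
    logits = net(
        out,
        disable_flex = True,            # skip the flex attention mask path while decoding
        disable_triton_kernel = True    # fall back to the pure pytorch attention path
    )
    return logits[:, -1].argmax(dim = -1, keepdim = True)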
