Update generate.py

mikekgfb · web-flow · commit 8e18e7fc6875 · 2025-01-23T18:52:16.000-08:00
Push SDPA backend context manager (sdpa_kernel) into caller
diff --git a/torchchat/generate.py b/torchchat/generate.py
@@ -532,7 +532,6 @@ def decode_n_tokens(
         callback=lambda _: _,
         eos_token_id: int = 2,
         eot_id: Optional[int] = None,
-        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         **sampling_kwargs,
     ):
         new_tokens, new_probs = [], []
@@ -541,7 +540,8 @@ def decode_n_tokens(
             num_new_tokens - 1
         ):  # -1 to save space to run an EoS if dont generate it naturally
             # Actually better for Inductor to codegen attention here
-            with torch.nn.attention.sdpa_kernel([attention_backend]):
+            # with torch.nn.attention.sdpa_kernel([attention_backend]):
+            if True: # preserve indentation while testing
 
                 out_token = cur_token.clone()
                 next_token, next_prob = self.decode_one_token(
@@ -685,7 +685,6 @@ def generate(
         sequential_prefill=True,
         callback=lambda x: x,
         max_seq_length: int,
-        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         seed: Optional[int] = None,
         **sampling_kwargs,
     ) -> torch.Tensor:
@@ -802,7 +801,6 @@ def generate(
                     if self.is_llama3_model
                     else None
                 ),
-                attention_backend=attention_backend,
                 **sampling_kwargs,
             ):
                 generated_tokens.append(generated_token.view(-1))
@@ -1174,7 +1172,7 @@ def callback(x, *, done_generating=False):
                 prof = torch.profiler.profile()
             t0 = time.perf_counter()
             num_tokens_generated = 0
-            with prof:
+            with torch.nn.attention.sdpa_kernel([self.builder_args.attention_backend]), prof:
                 generator_func = self.generate(
                     self.model,
                     encoded,
@@ -1190,7 +1188,6 @@ def callback(x, *, done_generating=False):
                     start_pos=start_pos,
                     skip_cache_setup=not is_first_sample,
                     max_seq_length=max_seq_length,
-                    attention_backend=self.builder_args.attention_backend,
                 )
                 if generator_args.chat_mode:
                     start_pos += encoded.size(0)