
Commit 7aa4058

Fix teacher-forced decode loop: avoid scalar-constant specialization and cache_position layout drift
Teacher forcing was feeding a per-step scalar token (`ground_truth_tokens[step].to(device)`). On XLA-style backends a 0-dim scalar commonly takes the scalar-constant path, which specializes the compiled program on the token value; during decode this produces many unique programs (one per distinct token) and can blow out instruction/L1 caches. Fix: slice on CPU to a stable-shaped `[1, 1]` tensor each step and transfer it as runtime data, then expand to `[batch, 1]` and materialize a contiguous buffer to avoid broadcast/stride issues.

Separately, `cache_position` updates done on-device produced an si32 buffer with a different (non-tiled) layout than the compiled model expects (tiled si32), leading to a TTIR-to-TTNN compilation failure on Gemma. Fix: round-trip `cache_position` through CPU — normalize to shape `[1]` via `reshape(-1)[-1:]`, increment on host, then re-upload so the device import path restores the expected layout.
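The stable-shape pattern can be sketched in isolation (CPU-only; `next_input_ids`, `gt`, and the batch size here are illustrative names, not part of the commit): slicing with `step : step + 1` yields a 1-element tensor rather than a 0-dim scalar, so every decode step presents the compiler with a same-shaped runtime input instead of a baked-in constant.

```python
import torch

def next_input_ids(ground_truth_tokens: torch.Tensor, step: int, batch_size: int) -> torch.Tensor:
    """Return a contiguous [batch_size, 1] token tensor for the next decode step."""
    # step : step + 1 keeps a 1-element tensor (never a 0-dim scalar),
    # so the token enters the step as runtime data with a stable shape.
    tok = ground_truth_tokens[step : step + 1].view(1, 1)
    # Expand to the batch and materialize a contiguous buffer to avoid
    # broadcast/stride surprises on transfer.
    return tok.expand(batch_size, 1).contiguous()

gt = torch.tensor([5, 9, 2, 7], dtype=torch.long)
shapes = {next_input_ids(gt, s, batch_size=3).shape for s in range(len(gt))}
print(shapes)  # every step produces the same shape: {torch.Size([3, 1])}
```

Because the shape (and dtype) never change across steps, a tracing compiler can reuse one compiled program for the entire decode loop.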
1 parent 34392af commit 7aa4058

File tree

1 file changed: +11 −14 lines


benchmark/tt-xla/decode_utils.py

Lines changed: 11 additions & 14 deletions
```diff
@@ -77,6 +77,9 @@ def teacher_forced_generate(
 
     assert_eval_no_dropout(model, verbose=verbose)
 
+    # Capture batch size before the loop replaces input_args["input_ids"].
+    batch_size = input_args["input_ids"].shape[0]
+
     output_tokens: list[list[str]] = []
     output_logits: list[torch.Tensor] = []
     predicted_tokens: list[int] = []
@@ -97,22 +100,16 @@
         output_text = [tokenizer.decode(token_id) for token_id in next_token_ids]
         output_tokens.append(output_text)
 
-        # Teacher forcing update: feed ground truth for next step.
-        if step < ground_truth_tokens.shape[0]:
-            batch_size = input_args["input_ids"].shape[0]
-            gt_token = ground_truth_tokens[step].to(device)
-            input_args["input_ids"] = gt_token.view(1, 1).expand(batch_size, 1).contiguous()
-        else:
-            # If caller asks for more steps than ground truth provides, keep feeding last GT token.
-            batch_size = input_args["input_ids"].shape[0]
-            gt_token = ground_truth_tokens[-1].to(device)
-            input_args["input_ids"] = gt_token.view(1, 1).expand(batch_size, 1).contiguous()
-
-        host_cache_pos = input_args["cache_position"].to("cpu")
-        host_cache_pos = torch.tensor([host_cache_pos[-1:] + 1])
-        input_args["cache_position"] = host_cache_pos.to(device)
+        # Teacher forcing: keep token as runtime data (stable shape) to avoid scalar-constant specialization.
+        next_tok_host = ground_truth_tokens[step : step + 1].view(1, 1)  # CPU [1,1]
+        input_args["input_ids"] = next_tok_host.expand(batch_size, 1).contiguous().to(device)
+
+        # cache_position: host normalize/update to keep a stable [1] shape.
+        host_cache_pos = input_args["cache_position"].to("cpu").reshape(-1)[-1:]  # CPU [1]
+        input_args["cache_position"] = (host_cache_pos + 1).to(device)
 
         iteration_times_ns.append(time.perf_counter_ns() - start)
+
         if verbose:
             print(f"Iteration\t{step}/{max_tokens_to_generate}\ttook {iteration_times_ns[-1] / 1e6:.04} ms")
 
```
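The `cache_position` fix likewise reduces to a small, backend-agnostic sketch (plain CPU tensors here; the re-upload to device via `.to(device)` is elided, and the starting positions are illustrative):

```python
import torch

# Suppose cache_position arrives as the prefill positions [0..7] in si32.
cache_position = torch.arange(8, dtype=torch.int32)

# Normalize to shape [1] (the last position), regardless of incoming shape...
host_cache_pos = cache_position.reshape(-1)[-1:]

# ...then increment on host. Re-uploading this freshly built host tensor lets
# the device import path choose the (tiled) layout the compiled model expects,
# instead of inheriting the layout of an on-device intermediate.
next_cache_position = host_cache_pos + 1
print(next_cache_position)  # tensor([8], dtype=torch.int32)
```

The key property is that the arithmetic happens on host, so the tensor handed back to the device is a clean import rather than the output of an on-device op with a drifted layout.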
