Skip to content

Commit 3835256

Browse files
committed
- Remove batch_size=1 override from all accuracy test functions
- Update teacher forcing to replicate ground truth across batch dimension
- Add --accuracy-testing pytest option (usage: --accuracy-testing true)
- Remove default values from accuracy_testing parameters to allow fixture injection
1 parent a5c1547 commit 3835256

File tree

3 files changed

+40
-107
lines changed

3 files changed

+40
-107
lines changed

benchmark/tt-xla/conftest.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,13 @@ def pytest_addoption(parser):
162162
type=make_validator_boolean("--experimental-compile"),
163163
help="Enable experimental compile flag (true/false). Overrides config value.",
164164
)
165+
parser.addoption(
166+
"--accuracy-testing",
167+
action="store",
168+
default=None,
169+
type=make_validator_boolean("--accuracy-testing"),
170+
help="Enable accuracy testing mode (true/false). Uses reference data for TOP1/TOP5 accuracy.",
171+
)
165172

166173

167174
@pytest.fixture
@@ -217,3 +224,9 @@ def task(request):
217224
@pytest.fixture
218225
def experimental_compile(request):
219226
return request.config.getoption("--experimental-compile")
227+
228+
229+
@pytest.fixture
230+
def accuracy_testing(request):
231+
value = request.config.getoption("--accuracy-testing")
232+
return value if value is not None else False

benchmark/tt-xla/llm_benchmark.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def generate_and_benchmark(
226226
logits = read_logits_fn(output).to("cpu")
227227
output_logits.append(logits)
228228
next_token_ids = logits[:, -1].argmax(dim=-1)
229-
predicted_token = next_token_ids[0].item() # Assuming batch_size=1
229+
predicted_token = next_token_ids[0].item() # Extract from batch[0] (all items identical in accuracy mode)
230230
predicted_tokens.append(predicted_token)
231231

232232
output_text = [tokenizer.decode(token_id) for token_id in next_token_ids]
@@ -245,8 +245,12 @@ def generate_and_benchmark(
245245
# Update inputs for next iteration
246246
if ground_truth_tokens is not None:
247247
# Teacher forcing: use ground truth token as next input
248+
# Replicate ground truth token for all batch items (they're all identical)
249+
batch_size = input_args["input_ids"].shape[0]
248250
gt_token = ground_truth_tokens[step]
249-
input_args["input_ids"] = gt_token.unsqueeze(0).unsqueeze(0).to(device) # Shape: [1, 1]
251+
input_args["input_ids"] = (
252+
gt_token.unsqueeze(0).unsqueeze(0).expand(batch_size, 1).to(device)
253+
) # Shape: [batch_size, 1]
250254
else:
251255
# Standard generation: use predicted token as next input
252256
input_args["input_ids"] = next_token_ids.unsqueeze(-1).to(device)

0 commit comments

Comments (0)