Skip to content

Commit e3e9925

Browse files
committed
Addressing PR comments
- Add benchmark/tt-xla/scripts/generate_reference_outputs.py script - Remove ground truth from warmup - Add utility function for initializing accuracy testing: benchmark/tt-xla/utils.py::initialize_accuracy_testing
1 parent bea32e9 commit e3e9925

File tree

4 files changed

+271
-25
lines changed

4 files changed

+271
-25
lines changed

benchmark/tt-xla/llm_benchmark.py

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@
3232
create_benchmark_result,
3333
compute_pcc,
3434
build_xla_export_name,
35+
initialize_accuracy_testing,
3536
)
36-
from token_accuracy import TokenAccuracy
3737

3838
xr.set_device_type("TT")
3939

@@ -388,27 +388,10 @@ def benchmark_llm_torch_xla(
388388
token_accuracy = None
389389
custom_input_prompt = None
390390
if accuracy_testing:
391-
if model_name_for_accuracy is None:
392-
raise ValueError("model_name_for_accuracy must be provided when accuracy_testing=True")
393-
394-
# Use half the cache for prefill, half for decode
395-
# This ensures we fit within hardware constraints
396-
max_prefill = max_cache_len // 2
397-
max_decode = max_cache_len // 2
398-
399-
token_accuracy = TokenAccuracy(
400-
model_name=model_name_for_accuracy,
401-
max_prefill_tokens=max_prefill,
402-
max_decode_tokens=max_decode,
403-
)
404-
405-
# Get Tale of Two Cities text from reference data
406-
custom_input_prompt = token_accuracy.prepare_ref_tokens(tokenizer)
407-
print(
408-
f"Using reference text for accuracy testing:"
409-
f"\n Max prefill: {max_prefill} tokens"
410-
f"\n Max decode: {max_decode} tokens"
411-
f"\n Text preview: {custom_input_prompt[:100]}..."
391+
token_accuracy, custom_input_prompt = initialize_accuracy_testing(
392+
model_name_for_accuracy=model_name_for_accuracy,
393+
max_cache_len=max_cache_len,
394+
tokenizer=tokenizer,
412395
)
413396

414397
# Construct inputs, including static cache
@@ -477,7 +460,6 @@ def benchmark_llm_torch_xla(
477460
# Warmup run
478461
print("Warming up...")
479462
warmup_tokens = min(MIN_STEPS, max_tokens_to_generate)
480-
ground_truth_for_warmup = token_accuracy.reference_tokens[:warmup_tokens] if accuracy_testing else None
481463
_, _, _ = generate_and_benchmark(
482464
compiled_model,
483465
input_args,
@@ -488,7 +470,6 @@ def benchmark_llm_torch_xla(
488470
verbose=False,
489471
is_multichip=is_multichip,
490472
mesh=mesh,
491-
ground_truth_tokens=ground_truth_for_warmup,
492473
)
493474

494475
# Reconstruct inputs for the actual benchmark run
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""
6+
Generate reference outputs for LLM accuracy testing.
7+
8+
This script loads a HuggingFace model, runs it on the "Tale of Two Cities" text corpus,
9+
and generates a .refpt file containing reference tokens and top-5 predictions for each position.
10+
11+
The .refpt files are used by the TokenAccuracy class for measuring TOP1 and TOP5 accuracy
12+
during model inference testing.
13+
14+
Usage:
15+
python3 <path-to-script>/generate_reference_outputs.py \\
16+
--model "meta-llama/Llama-3.2-1B-Instruct" \\
17+
--output_file "<output-dir>/Llama-3.2-1B-Instruct.refpt" \\
18+
--total_length 1024
19+
20+
Output format (.refpt file):
21+
{
22+
'reference_tokens': torch.Tensor, # Shape: [1, total_length]
23+
'top5_tokens': torch.Tensor, # Shape: [total_length, 5]
24+
}
25+
"""
26+
27+
import argparse
28+
import bz2
29+
import os
30+
31+
import torch
32+
import transformers
33+
from loguru import logger
34+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
35+
36+
37+
def generate_reference_outputs(total_length, output_file, model_name):
    """
    Generate reference outputs for accuracy testing using HuggingFace models.

    Runs ``model_name`` over the first ``total_length`` tokens of the bundled
    "Tale of Two Cities" corpus, records the top-5 next-token predictions at
    every position, prints a per-position prediction table plus per-100-token
    accuracy summaries, and saves a ``.refpt`` file (torch.save format) at
    ``output_file`` containing the reference tokens, the top-5 predictions,
    and the library versions used to produce them.

    Args:
        total_length: Number of tokens to process from Tale of Two Cities.
        output_file: Path to save the .refpt file.
        model_name: HuggingFace model name (e.g. 'meta-llama/Llama-3.2-1B-Instruct').

    Raises:
        FileNotFoundError: If the bundled corpus file cannot be located.
    """
    # Set device for the input tokens; the model itself is placed by device_map="auto".
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Load model and tokenizer from HuggingFace
    config = AutoConfig.from_pretrained(model_name)

    # Qwen only: add rope scaling to the config, for long context support.
    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts
    if "Qwen" in model_name:
        config.rope_scaling = {"factor": 4.0, "original_max_position_embeddings": 32768, "type": "yarn"}

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config, device_map="auto")
    model.eval()

    # Load the book text - look in ../reference_outputs relative to script location.
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    prompt_file = os.path.join(parent_dir, "reference_outputs", "tale-of-two-cities.txt.bz2")

    if not os.path.exists(prompt_file):
        raise FileNotFoundError(
            f"Tale of Two Cities text file not found: {prompt_file}\n"
            f"Please ensure tale-of-two-cities.txt.bz2 exists in the reference_outputs directory."
        )

    logger.info(f"Loading text from {prompt_file}")
    with bz2.open(prompt_file, "rt", encoding="utf-8") as f:
        text = f.read()

    # Encode text to tokens, truncated to the requested length.
    encoded_tokens = tokenizer.encode(text, add_special_tokens=True)[:total_length]
    encoded_tokens_tensor = torch.tensor(encoded_tokens, device=device).unsqueeze(0)  # Shape [1, seq_len] on device

    logger.info(f"Processing {len(encoded_tokens)} tokens")
    logger.info(f"Model: {model_name}")
    logger.info(f"Output file: {output_file}")

    print(f"{'Progress':<15}{'Correct':<8}{'Actual':<15}{'Top 5 Predictions':<75}")
    print("-" * 113)

    # Initialize lists to store results
    all_top1_correct = []
    all_top5_correct = []
    all_top5_tokens = []
    segment_accuracies = []
    chunk_size = 1024

    # Strip characters that would break the fixed-width table layout.
    # Hoisted out of the token loop (the original rebuilt a lambda per token).
    def _sanitize(s):
        return s.replace("\n", "").replace("\r", "").replace("\x0c", "")

    with torch.no_grad():
        for chunk_start in range(0, total_length - 1, chunk_size):
            chunk_end = min(chunk_start + chunk_size, total_length)
            # Inputs are positions [chunk_start, chunk_end); targets are shifted by one.
            chunk_tokens = encoded_tokens_tensor[:, chunk_start:chunk_end]
            chunk_next_tokens = encoded_tokens[chunk_start + 1 : chunk_end + 1]
            actual_chunk_size = min(len(chunk_tokens[0]), len(chunk_next_tokens))

            # Trim input chunk so inputs and targets line up one-to-one.
            chunk_tokens = chunk_tokens[:, :actual_chunk_size]

            # Process chunk using the HuggingFace model. chunk_tokens is already
            # on `device`, so no extra .to(device) transfer is needed here.
            outputs = model(chunk_tokens)
            logits = outputs.logits

            # Compute top-5 predictions directly on the logits: softmax is
            # strictly monotonic, so the top-k token indices are identical to
            # top-k over probabilities — the extra softmax pass is unnecessary.
            _, chunk_top5_tokens = torch.topk(logits, k=5, dim=-1)  # Shape: [1, chunk_size, 5]
            chunk_top5_tokens = chunk_top5_tokens.squeeze(0)  # Shape: [chunk_size, 5]

            # Targets on the same device as the predictions.
            chunk_next_tokens_tensor = torch.tensor(chunk_next_tokens[:actual_chunk_size], device=device)

            # TOP1: best prediction matches; TOP5: target appears anywhere in the top 5.
            chunk_top1_correct = chunk_top5_tokens[:, 0] == chunk_next_tokens_tensor
            chunk_top5_correct = torch.any(chunk_top5_tokens == chunk_next_tokens_tensor.unsqueeze(1), dim=1)

            # Store results
            all_top1_correct.extend(chunk_top1_correct.tolist())
            all_top5_correct.extend(chunk_top5_correct.tolist())
            all_top5_tokens.append(chunk_top5_tokens)

            # Print predictions for this chunk
            for i in range(len(chunk_next_tokens)):
                global_pos = chunk_start + i
                next_token = chunk_next_tokens[i]

                actual_token = _sanitize(tokenizer.decode([next_token]))
                top5_tokens = [_sanitize(tokenizer.decode([t.item()])) for t in chunk_top5_tokens[i]]
                # "x" = top-1 hit, "-" = top-5 hit, " " = miss.
                correct = "x" if chunk_top1_correct[i] else ("-" if chunk_top5_correct[i] else " ")
                top5_str = " ".join(f"{t:<14}" for t in top5_tokens)

                progress_str = f"{global_pos+1}/{total_length-1}"
                print(f"{progress_str:<15}{correct:<8}{actual_token:<15}{top5_str}")

                # Calculate and store segment accuracies every 100 tokens
                # (plus the final, possibly partial, segment).
                if (global_pos + 1) % 100 == 0 or global_pos == total_length - 2:
                    start_idx = (global_pos // 100) * 100
                    end_idx = min(start_idx + 100, len(all_top1_correct))
                    segment_top1_acc = sum(all_top1_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100
                    segment_top5_acc = sum(all_top5_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100
                    # Guard against appending the same segment twice when both
                    # trigger conditions fire for one position.
                    if len(segment_accuracies) <= global_pos // 100:
                        segment_accuracies.append((segment_top1_acc, segment_top5_acc))

    # Save the data - ensure tensors are concatenated and on CPU.
    # NOTE(review): if the corpus is shorter than total_length, reference_tokens
    # will be shorter than [1, total_length] — presumably consumers handle this;
    # verify against TokenAccuracy.
    data = {
        "top5_tokens": torch.cat(all_top5_tokens, dim=0).cpu(),
        "reference_tokens": encoded_tokens_tensor[:, :total_length].clone().cpu(),
        "library_versions": {
            "torch": torch.__version__,
            "transformers": transformers.__version__,
        },
    }

    torch.save(data, output_file)
    logger.info(f"Saved reference outputs to {output_file}")
    logger.info(f"Library versions: torch={torch.__version__}, transformers={transformers.__version__}")

    # Print all segment accuracy summaries as a table
    print("\nSegment Accuracy Summaries:")
    print(f"{'Tokens':<15}{'Top-1 Accuracy':<20}{'Top-5 Accuracy':<20}")
    print("-" * 55)
    for i, (top1_acc, top5_acc) in enumerate(segment_accuracies):
        start_token = i * 100 + 1
        end_token = min((i + 1) * 100, total_length)
        print(f"{f'{start_token}-{end_token}':<15}{f'{top1_acc:.2f}%':<20}{f'{top5_acc:.2f}%':<20}")

    # Overall accuracy is the unweighted mean over segments (a final partial
    # segment counts the same as a full one).
    overall_top1_acc = sum(acc[0] for acc in segment_accuracies) / len(segment_accuracies)
    overall_top5_acc = sum(acc[1] for acc in segment_accuracies) / len(segment_accuracies)
    print("-" * 55)
    print(f"{'Overall':<15}{f'{overall_top1_acc:.2f}%':<20}{f'{overall_top5_acc:.2f}%':<20}")
182+
183+
184+
def _build_arg_parser():
    """Construct the command-line parser for this script."""
    arg_parser = argparse.ArgumentParser(
        description="Generate reference outputs for LLM accuracy testing using HuggingFace models.",
        epilog="""
Examples:
  # Generate reference for Llama 3.2 1B
  python3 generate_reference_outputs.py \\
      --model "meta-llama/Llama-3.2-1B-Instruct" \\
      --output_file "../reference_outputs/Llama-3.2-1B-Instruct.refpt"

  # Generate with custom length
  python3 generate_reference_outputs.py \\
      --model "mistralai/Mistral-7B-Instruct-v0.3" \\
      --output_file "../reference_outputs/Mistral-7B-Instruct-v0.3.refpt" \\
      --total_length 2048
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_parser.add_argument(
        "--total_length", type=int, default=1024, help="Total length of tokens to process (default: 1024)"
    )
    arg_parser.add_argument(
        "--output_file",
        type=str,
        required=True,
        help="Output file path for reference data (e.g., '../reference_outputs/ModelName.refpt')",
    )
    arg_parser.add_argument(
        "--model", type=str, required=True, help="HuggingFace model name (e.g., 'meta-llama/Llama-3.2-1B-Instruct')"
    )
    return arg_parser


def main():
    """Parse CLI arguments and run reference-output generation."""
    cli_args = _build_arg_parser().parse_args()
    generate_reference_outputs(
        total_length=cli_args.total_length,
        output_file=cli_args.output_file,
        model_name=cli_args.model,
    )


if __name__ == "__main__":
    main()

benchmark/tt-xla/token_accuracy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
1+
# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

benchmark/tt-xla/utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,3 +464,48 @@ def move_to_cpu(data):
464464
moved = [move_to_cpu(item) for item in data]
465465
return type(data)(moved)
466466
return data
467+
468+
469+
def initialize_accuracy_testing(model_name_for_accuracy: str, max_cache_len: int, tokenizer):
    """
    Set up token-accuracy measurement for an LLM benchmark run.

    Builds a ``TokenAccuracy`` tracker for *model_name_for_accuracy*, splitting
    the cache budget evenly between prefill and decode, and obtains the
    reference ("Tale of Two Cities") text to use as the benchmark prompt.

    Args:
        model_name_for_accuracy: Model name for .refpt file lookup.
        max_cache_len: Maximum cache length (determines prefill and decode splits).
        tokenizer: HuggingFace tokenizer instance.

    Returns:
        Tuple of (token_accuracy, custom_input_prompt)
            - token_accuracy: TokenAccuracy instance
            - custom_input_prompt: Reference text string for benchmarking

    Raises:
        ValueError: If model_name_for_accuracy is None.
    """
    # Imported here (not at module top) so importing utils does not require
    # token_accuracy to be importable.
    from token_accuracy import TokenAccuracy

    if model_name_for_accuracy is None:
        raise ValueError("model_name_for_accuracy must be provided when accuracy_testing=True")

    # Split the cache budget in half: one share for prefill, one for decode.
    # This ensures we fit within hardware constraints.
    prefill_budget, decode_budget = max_cache_len // 2, max_cache_len // 2

    accuracy_tracker = TokenAccuracy(
        model_name=model_name_for_accuracy,
        max_prefill_tokens=prefill_budget,
        max_decode_tokens=decode_budget,
    )

    # Get Tale of Two Cities text from reference data
    reference_prompt = accuracy_tracker.prepare_ref_tokens(tokenizer)
    print(
        f"Using reference text for accuracy testing:"
        f"\n Max prefill: {prefill_budget} tokens"
        f"\n Max decode: {decode_budget} tokens"
        f"\n Text preview: {reference_prompt[:100]}..."
    )

    return accuracy_tracker, reference_prompt

0 commit comments

Comments
 (0)