Skip to content

Commit e1c8e5f

Browse files
committed
Introduce accuracy testing for LLMs in tt-forge.
We currently use an identical approach to accuracy testing as tt-metal, for 1-1 comparison.
- Edit llm_benchmark.py to add option "accuracy_testing". When this is true, top1 and top5 metrics are calculated using precomputed CPU model outputs stored in the reference_outputs folder (e.g. benchmark/tt-xla/reference_outputs/Qwen2.5-0.5B-Instruct.refpt). When accuracy_testing is true, we don't perform PCC checks.
- Implement a TokenAccuracy class that manages computing top1 and top5 metrics from reference_outputs.
- Add accuracy tests in test_llms.py.
- Add tests to perf-bench-matrix.json.
Generating reference outputs:
- Add a generate_reference_outputs.py script that loads a Huggingface model, runs it on the "Tale of Two Cities" text corpus, and generates a .refpt file containing reference tokens and top-5 predictions for each position.
- Add a /reference_outputs directory with the reference .refpt files and a README that explains how reference files are created and used.
1 parent 3fe9490 commit e1c8e5f

28 files changed

+1032
-24
lines changed

.github/workflows/perf-bench-matrix.json

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,111 @@
281281
"name": "unet_for_conditional_generation",
282282
"pyreq": "accelerate datasets diffusers==0.36.0 loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
283283
"pytest": "benchmark/tt-xla/test_encoders.py::test_unet_for_conditional_generation"
284+
},
285+
{
286+
"name": "llama_3_2_1b_instruct_accuracy",
287+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
288+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b_accuracy"
289+
},
290+
{
291+
"name": "llama_3_2_3b_instruct_accuracy",
292+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
293+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b_accuracy"
294+
},
295+
{
296+
"name": "llama_3_1_8b_instruct_accuracy",
297+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
298+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b_accuracy"
299+
},
300+
{
301+
"name": "mistral_7b_accuracy",
302+
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1 protobuf sentencepiece",
303+
"pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b_accuracy"
304+
},
305+
{
306+
"name": "qwen_2_5_7b_instruct_accuracy",
307+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
308+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b_accuracy"
309+
},
310+
{
311+
"name": "google_gemma-1.1-2b-it_accuracy",
312+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
313+
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b_accuracy"
314+
},
315+
{
316+
"name": "google_gemma-2-2b-it_accuracy",
317+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
318+
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b_accuracy"
319+
},
320+
{
321+
"name": "microsoft_phi-1_accuracy",
322+
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
323+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1_accuracy"
324+
},
325+
{
326+
"name": "microsoft_phi-1_5_accuracy",
327+
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
328+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5_accuracy"
329+
},
330+
{
331+
"name": "microsoft_phi-2_accuracy",
332+
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
333+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi2_accuracy"
334+
},
335+
{
336+
"name": "tiiuae_falcon3-1b-base_accuracy",
337+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
338+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b_accuracy"
339+
},
340+
{
341+
"name": "tiiuae_falcon3-3b-base_accuracy",
342+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
343+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b_accuracy"
344+
},
345+
{
346+
"name": "tiiuae_falcon3-7b-base_accuracy",
347+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
348+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b_accuracy"
349+
},
350+
{
351+
"name": "qwen_2_5_0_5b_instruct_accuracy",
352+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
353+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b_accuracy"
354+
},
355+
{
356+
"name": "qwen_2_5_1_5b_instruct_accuracy",
357+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
358+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b_accuracy"
359+
},
360+
{
361+
"name": "qwen_2_5_3b_instruct_accuracy",
362+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
363+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b_accuracy"
364+
},
365+
{
366+
"name": "qwen_3_0_6b_accuracy",
367+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
368+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b_accuracy"
369+
},
370+
{
371+
"name": "qwen_3_1_7b_accuracy",
372+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
373+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b_accuracy"
374+
},
375+
{
376+
"name": "qwen_3_4b_accuracy",
377+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
378+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b_accuracy"
379+
},
380+
{
381+
"name": "qwen_3_8b_accuracy",
382+
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
383+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b_accuracy"
384+
},
385+
{
386+
"name": "ministral_8b_accuracy",
387+
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
388+
"pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b_accuracy"
284389
}
285390
]
286391
}
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""
6+
Generate reference outputs for LLM accuracy testing.
7+
8+
This script loads a HuggingFace model, runs it on the "Tale of Two Cities" text corpus,
9+
and generates a .refpt file containing reference tokens and top-5 predictions for each position.
10+
11+
The .refpt files are used by the TokenAccuracy class for measuring TOP1 and TOP5 accuracy
12+
during model inference testing.
13+
14+
Usage:
15+
python3 benchmark/tt-xla/generate_reference_outputs.py \\
16+
--model "meta-llama/Llama-3.2-1B-Instruct" \\
17+
--output_file "benchmark/tt-xla/reference_outputs/Llama-3.2-1B-Instruct.refpt" \\
18+
--total_length 1024
19+
20+
Output format (.refpt file):
21+
{
22+
'reference_tokens': torch.Tensor, # Shape: [1, total_length]
23+
'top5_tokens': torch.Tensor, # Shape: [total_length, 5]
24+
}
25+
"""
26+
27+
import argparse
28+
import bz2
29+
import os
30+
31+
import torch
32+
from loguru import logger
33+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
34+
35+
36+
def generate_reference_outputs(total_length: int, output_file: str, model_name: str) -> None:
    """
    Generate reference outputs for accuracy testing using HuggingFace models.

    Runs the model over the bundled "Tale of Two Cities" corpus in chunks and
    records, for every position, the top-5 next-token predictions. Saves a
    .refpt file (via torch.save) containing:
        'top5_tokens':      torch.Tensor of shape [total_length - 1, 5] (CPU)
        'reference_tokens': torch.Tensor of shape [1, total_length] (CPU)
    Also prints a per-token prediction table and per-100-token accuracy
    summaries to stdout.

    Args:
        total_length: Number of tokens to process from Tale of Two Cities
        output_file: Path to save .refpt file
        model_name: HuggingFace model name (e.g., 'meta-llama/Llama-3.2-1B-Instruct')

    Raises:
        FileNotFoundError: if the compressed corpus file is missing from the
            reference_outputs directory next to this script.
    """
    # Set device (used for the token tensors; the model itself is placed by
    # device_map="auto" below)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Load model and tokenizer from HuggingFace
    config = AutoConfig.from_pretrained(model_name)

    # Qwen only: add rope scaling to the config, for long context support.
    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts
    if "Qwen" in model_name:
        config.rope_scaling = {"factor": 4.0, "original_max_position_embeddings": 32768, "type": "yarn"}

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config, device_map="auto")
    model.eval()

    # Load the book text (bzip2-compressed corpus shipped alongside this script)
    current_file_path = os.path.abspath(__file__)
    current_file_dir = os.path.dirname(current_file_path)
    prompt_file = os.path.join(current_file_dir, "reference_outputs", "tale-of-two-cities.txt.bz2")

    if not os.path.exists(prompt_file):
        raise FileNotFoundError(
            f"Tale of Two Cities text file not found: {prompt_file}\n"
            f"Please ensure the file exists in the reference_outputs directory."
        )

    logger.info(f"Loading text from {prompt_file}")
    with bz2.open(prompt_file, "rt", encoding="utf-8") as f:
        text = f.read()

    # Encode text to tokens; truncate to exactly total_length positions
    encoded_tokens = tokenizer.encode(text, add_special_tokens=True)[:total_length]
    encoded_tokens_tensor = torch.tensor(encoded_tokens, device=device).unsqueeze(0)  # Shape [1, seq_len] on device

    logger.info(f"Processing {len(encoded_tokens)} tokens")
    logger.info(f"Model: {model_name}")
    logger.info(f"Output file: {output_file}")

    # Header for the per-token prediction table printed below
    print(f"{'Progress':<15}{'Correct':<8}{'Actual':<15}{'Top 5 Predictions':<75}")
    print("-" * 113)

    # Initialize lists to store results
    all_top1_correct = []  # bool per position: top-1 prediction matched the actual next token
    all_top5_correct = []  # bool per position: actual next token appeared in the top-5
    all_top5_tokens = []  # per-chunk [chunk, 5] tensors, concatenated at the end
    segment_accuracies = []  # (top1%, top5%) per 100-token segment
    chunk_size = 1024  # NOTE: with the default total_length=1024 the loop runs a single chunk

    with torch.no_grad():
        # Iterate to total_length - 1 because the last token has no "next token" target.
        for chunk_start in range(0, total_length - 1, chunk_size):
            chunk_end = min(chunk_start + chunk_size, total_length)
            # Get input and target chunks; targets are the inputs shifted by one position
            chunk_tokens = encoded_tokens_tensor[:, chunk_start:chunk_end]
            chunk_next_tokens = encoded_tokens[chunk_start + 1 : chunk_end + 1]
            actual_chunk_size = min(len(chunk_tokens[0]), len(chunk_next_tokens))

            # Trim input chunk if needed (last chunk has one fewer target than inputs)
            chunk_tokens = chunk_tokens[:, :actual_chunk_size]

            # Process chunk using HuggingFace model
            outputs = model(chunk_tokens.to(device))
            logits = outputs.logits

            # Compute top-5 predictions (softmax is monotonic, so topk over probs
            # equals topk over logits; kept for parity with the logged values)
            probs = torch.softmax(logits, dim=-1)
            _, chunk_top5_tokens = torch.topk(probs, k=5, dim=-1)  # Shape: [1, chunk_size, 5]
            chunk_top5_tokens = chunk_top5_tokens.squeeze(0)  # Shape: [chunk_size, 5]

            # Get next tokens tensor
            chunk_next_tokens_tensor = torch.tensor(
                chunk_next_tokens[:actual_chunk_size], device=device
            )  # Move to same device

            # Calculate correctness
            chunk_top1_correct = chunk_top5_tokens[:, 0] == chunk_next_tokens_tensor
            chunk_top5_correct = torch.any(chunk_top5_tokens == chunk_next_tokens_tensor.unsqueeze(1), dim=1)

            # Store results
            all_top1_correct.extend(chunk_top1_correct.tolist())
            all_top5_correct.extend(chunk_top5_correct.tolist())
            all_top5_tokens.append(chunk_top5_tokens)

            # Print predictions for this chunk
            for i in range(len(chunk_next_tokens)):
                global_pos = chunk_start + i
                next_token = chunk_next_tokens[i]

                # Strip newline/CR/form-feed so decoded tokens stay on one table row
                sanitize = lambda x: x.replace("\n", "").replace("\r", "").replace("\x0c", "")
                actual_token = sanitize(tokenizer.decode([next_token]))
                top5_tokens = [sanitize(tokenizer.decode([t.item()])) for t in chunk_top5_tokens[i]]
                # 'x' = top-1 hit, '-' = only within top-5, ' ' = miss
                correct = "x" if chunk_top1_correct[i] else ("-" if chunk_top5_correct[i] else " ")
                top5_str = " ".join(f"{t:<14}" for t in top5_tokens)

                progress_str = f"{global_pos+1}/{total_length-1}"
                print(f"{progress_str:<15}{correct:<8}{actual_token:<15}{top5_str}")

                # Calculate and store segment accuracies every 100 tokens
                # (the second condition also closes out a final partial segment)
                if (global_pos + 1) % 100 == 0 or global_pos == total_length - 2:
                    start_idx = (global_pos // 100) * 100
                    end_idx = min(start_idx + 100, len(all_top1_correct))
                    segment_top1_acc = sum(all_top1_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100
                    segment_top5_acc = sum(all_top5_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100
                    # Guard against appending the same segment twice when both
                    # trigger conditions fire at the same position
                    if len(segment_accuracies) <= global_pos // 100:
                        segment_accuracies.append((segment_top1_acc, segment_top5_acc))

    # Save the data - ensure tensors are concatenated and on CPU
    data = {
        "top5_tokens": torch.cat(all_top5_tokens, dim=0).cpu(),
        "reference_tokens": encoded_tokens_tensor[:, :total_length].clone().cpu(),
    }

    torch.save(data, output_file)
    logger.info(f"Saved reference outputs to {output_file}")

    # Print all segment accuracy summaries as a table
    print("\nSegment Accuracy Summaries:")
    print(f"{'Tokens':<15}{'Top-1 Accuracy':<20}{'Top-5 Accuracy':<20}")
    print("-" * 55)
    for i, (top1_acc, top5_acc) in enumerate(segment_accuracies):
        start_token = i * 100 + 1
        end_token = min((i + 1) * 100, total_length)
        print(f"{f'{start_token}-{end_token}':<15}{f'{top1_acc:.2f}%':<20}{f'{top5_acc:.2f}%':<20}")

    # Calculate overall accuracy
    # NOTE(review): this is an unweighted mean over segments — a final partial
    # segment counts the same as a full 100-token one.
    overall_top1_acc = sum(acc[0] for acc in segment_accuracies) / len(segment_accuracies)
    overall_top5_acc = sum(acc[1] for acc in segment_accuracies) / len(segment_accuracies)
    print("-" * 55)
    print(f"{'Overall':<15}{f'{overall_top1_acc:.2f}%':<20}{f'{overall_top5_acc:.2f}%':<20}")
174+
175+
176+
def main():
    """Parse command-line arguments and generate reference outputs for one model."""
    usage_examples = """
Examples:
  # Generate reference for Llama 3.2 1B
  python3 benchmark/tt-xla/generate_reference_outputs.py \\
    --model "meta-llama/Llama-3.2-1B-Instruct" \\
    --output_file "benchmark/tt-xla/reference_outputs/Llama-3.2-1B-Instruct.refpt"

  # Generate with custom length
  python3 benchmark/tt-xla/generate_reference_outputs.py \\
    --model "mistralai/Mistral-7B-Instruct-v0.3" \\
    --output_file "benchmark/tt-xla/reference_outputs/Mistral-7B-Instruct-v0.3.refpt" \\
    --total_length 2048
"""
    cli = argparse.ArgumentParser(
        description="Generate reference outputs for LLM accuracy testing using HuggingFace models.",
        epilog=usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Registration order matters: it fixes the ordering in --help output.
    cli.add_argument(
        "--total_length", type=int, default=1024, help="Total length of tokens to process (default: 1024)"
    )
    cli.add_argument(
        "--output_file",
        type=str,
        required=True,
        help="Output file path for reference data (e.g., 'benchmark/tt-xla/reference_outputs/ModelName.refpt')",
    )
    cli.add_argument(
        "--model", type=str, required=True, help="HuggingFace model name (e.g., 'meta-llama/Llama-3.2-1B-Instruct')"
    )
    opts = cli.parse_args()

    generate_reference_outputs(total_length=opts.total_length, output_file=opts.output_file, model_name=opts.model)
209+
210+
211+
# Script entry point: parse CLI arguments and write the .refpt reference file.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)