
Commit e6c3c9f

dgolubovicTT authored and vkovacevicTT committed
Introduce accuracy testing to test_llms.py:
- Add an --accuracy-testing argument to the test_llms.py tests tracked in benchmark testing.
- Add a --batch-size argument to the accuracy tests in test_llms.py, because the default batch size of 32 does not fit on device with the larger input sequence length that accuracy testing requires; with batch size 32 at that sequence length, the 7B and 8B models failed with OOM errors.
- Run accuracy tests in a separate job, run-n150-accuracy-benchmarks, of the perf-benchmark-experimental workflow.

Teacher forcing for accuracy testing:
- Add teacher forcing support to generate_and_benchmark() for accuracy testing mode.
- Route to teacher_forced_generate() when ground_truth_tokens is provided.
- Update construct_inputs() to support pre-tokenized input and custom prompts.

Generating ground truth .refpt files (generate_reference_outputs.py):
- Add generate_reference_outputs.py to create ground truth .refpt files for accuracy testing.
- Generate reference top1/top5 token predictions for LLM accuracy benchmarking:
  - Loads HuggingFace models on CPU for deterministic inference.
  - Processes the "Tale of Two Cities" text corpus with teacher forcing.
  - Outputs .refpt files containing reference tokens and top-k predictions.
  - Used by the TokenAccuracy class to validate TOP1/TOP5 accuracy during benchmarks.
- Ensures reproducibility through eval mode, disabled dropout, greedy decoding, and a StaticCache matching the benchmark environment. Reference files must be regenerated if input_sequence_length changes.

Usage:
  python3 scripts/generate_reference_outputs.py \
    --model "meta-llama/Llama-3.2-1B-Instruct" \
    --output_file "reference_outputs/Llama-3.2-1B-Instruct.refpt" \
    --total_length 128

Adding a shared utility for decode (decode_utils.py):
Centralize the LLM decode operations used by reference output generation and accuracy testing:
- Teacher forcing generation with ground truth tokens
- Reference top-k prediction generation for .refpt files
- Static cache and accuracy testing initialization
- Logits extraction and top-k token utilities
Sharing the same decode logic, tokenization, and cache semantics prevents implementation drift between the reference generation and benchmark paths.

TokenAccuracy class for validating LLM inference quality (token_accuracy.py):
- Loads precomputed reference data from .refpt files (tokens, top1/top5 predictions)
- Validates that torch/transformers versions match the reference file for reproducibility
- Splits reference tokens into prefill (input) and decode (ground truth) windows
- Computes TOP1/TOP5 accuracy by comparing model predictions against the reference
- Provides teacher forcing tokens for deterministic decode loops

Slight refactoring:
- Simplify static cache initialization using the init_static_cache helper
- Remove the unused variables is_multichip and mesh from generate_and_benchmark()
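Because teacher forcing feeds the ground-truth token at every decode step, the model's logits at each step line up one-to-one with the reference next tokens, and TOP1/TOP5 accuracy reduces to a per-step comparison. A minimal sketch of that comparison (illustrative only — not the repo's TokenAccuracy implementation; tensor shapes and names are assumptions):

```python
import torch

def topk_accuracy(logits, reference_tokens):
    """Compare per-step teacher-forced logits against reference tokens.

    logits:           [num_steps, vocab_size] model outputs, one row per decode step
    reference_tokens: [num_steps] ground-truth next tokens from the .refpt file
    """
    # TOP1: greedy prediction must equal the reference token.
    top1 = logits.argmax(dim=-1)
    top1_acc = (top1 == reference_tokens).float().mean().item()
    # TOP5: the reference token must appear among the model's five
    # highest-scoring tokens at that step (assumes vocab_size >= 5).
    top5 = logits.topk(5, dim=-1).indices
    top5_acc = (top5 == reference_tokens.unsqueeze(-1)).any(dim=-1).float().mean().item()
    return top1_acc, top5_acc
```

With distinct logit values the result is deterministic, which is why the reference generation pins eval mode and greedy decoding.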
1 parent 4db9918 commit e6c3c9f

34 files changed (+1518 −103 lines)

.github/workflows/call-perf-test.yml

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ jobs:
             python benchmark/benchmark.py -p ${{ matrix.build.project}} -m ${{ matrix.build.name }} -bs ${{ matrix.build.bs }} -df ${{ matrix.build.df }} -lp ${{ matrix.build.lp }} ${{ matrix.build.input_sequence_length && format('-isl {0}', matrix.build.input_sequence_length) }} -ts ${{ matrix.build.ts }} -o ${{ steps.strings.outputs.perf_report_json_file }} ${{ inputs.run_id_source && format('-r {0}', inputs.run_id_source) }}
           else
             # Run with pytest
-            pytest -svv "${{ matrix.build.pytest }}" --output-file=${{ steps.strings.outputs.perf_report_json_file }}
+            pytest -svv "${{ matrix.build.pytest }}" ${{ matrix.build.accuracy-testing && '--accuracy-testing true' || '' }} ${{ matrix.build['batch-size'] && format('--batch-size {0}', matrix.build['batch-size']) || '' }} --output-file=${{ steps.strings.outputs.perf_report_json_file }}
           fi

       - name: Dump stablehlo to report

.github/workflows/perf-bench-matrix.json

Lines changed: 131 additions & 0 deletions
@@ -305,6 +305,137 @@
       "name": "unet_for_conditional_generation",
       "pyreq": "accelerate datasets diffusers==0.36.0 loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
       "pytest": "benchmark/tt-xla/test_encoders.py::test_unet_for_conditional_generation"
+    },
+    {
+      "name": "llama_3_2_1b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "llama_3_2_3b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "llama_3_1_8b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b",
+      "accuracy-testing": true,
+      "batch-size": 16
+    },
+    {
+      "name": "mistral_7b_accuracy",
+      "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1 protobuf sentencepiece",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b",
+      "accuracy-testing": true,
+      "batch-size": 8
+    },
+    {
+      "name": "qwen_2_5_7b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "google_gemma-1.1-2b-it_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "google_gemma-2-2b-it_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "microsoft_phi-1_accuracy",
+      "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_phi1",
+      "accuracy-testing": true
+    },
+    {
+      "name": "microsoft_phi-1_5_accuracy",
+      "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5",
+      "accuracy-testing": true
+    },
+    {
+      "name": "microsoft_phi-2_accuracy",
+      "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_phi2",
+      "accuracy-testing": true
+    },
+    {
+      "name": "tiiuae_falcon3-1b-base_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "tiiuae_falcon3-3b-base_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "tiiuae_falcon3-7b-base_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b",
+      "accuracy-testing": true,
+      "batch-size": 4
+    },
+    {
+      "name": "qwen_2_5_0_5b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "qwen_2_5_1_5b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "qwen_2_5_3b_instruct_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b",
+      "accuracy-testing": true,
+      "batch-size": 16
+    },
+    {
+      "name": "qwen_3_0_6b_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "qwen_3_1_7b_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "qwen_3_4b_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "qwen_3_8b_accuracy",
+      "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b",
+      "accuracy-testing": true
+    },
+    {
+      "name": "ministral_8b_accuracy",
+      "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
+      "pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b",
+      "accuracy-testing": true,
+      "batch-size": 16
     }
   ]
 }
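For illustration, this is roughly how one of the matrix entries above turns into the pytest invocation assembled by call-perf-test.yml (a sketch of the shell step's logic; the real output path comes from the workflow's steps.strings output, and `perf_report.json` here is a stand-in):

```python
import json
import shlex

def pytest_command(entry):
    """Reconstruct the pytest command line for one matrix entry:
    the accuracy-testing flag and batch-size are appended only when
    the entry defines them, mirroring the conditionals in the workflow."""
    cmd = ["pytest", "-svv", entry["pytest"]]
    if entry.get("accuracy-testing"):
        cmd += ["--accuracy-testing", "true"]
    if "batch-size" in entry:
        cmd += ["--batch-size", str(entry["batch-size"])]
    cmd.append("--output-file=perf_report.json")
    return shlex.join(cmd)

entry = json.loads("""{
  "name": "llama_3_1_8b_instruct_accuracy",
  "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b",
  "accuracy-testing": true,
  "batch-size": 16
}""")
print(pytest_command(entry))
```

Entries without `batch-size` (the 1B–4B models) keep the default batch of 32; the 7B/8B entries override it to avoid the OOM noted in the commit message.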

.github/workflows/perf-benchmark-experimental.yml

Lines changed: 32 additions & 0 deletions
@@ -15,6 +15,8 @@ jobs:
     outputs:
       matrix_p150: ${{ steps.set-perf-benchmarks.outputs.matrix_p150 }}
       matrix_p150_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_p150_skip }}
+      matrix_n150_accuracy: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy }}
+      matrix_n150_accuracy_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy_skip }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -28,6 +30,7 @@ jobs:
         id: set-perf-benchmarks
         shell: bash
         run: |
+          # Filter for regular p150 tests
           result=$(python .github/workflows/filter-test-matrix.py \
             .github/workflows/perf-bench-matrix.json \
             "tt-forge")
@@ -44,6 +47,25 @@ jobs:
           echo "matrix_p150=$matrix_p150" >> $GITHUB_OUTPUT
           echo "matrix_p150_skip=$matrix_p150_skip" >> $GITHUB_OUTPUT

+          # Filter for n150 accuracy tests
+          # Call filter-test-matrix.py with --sh-runner flag to map n150 to shared runners
+          result_sh=$(python .github/workflows/filter-test-matrix.py \
+            .github/workflows/perf-bench-matrix.json \
+            "tt-forge" \
+            --sh-runner)
+
+          # Filter by: runs-on contains "n150" AND accuracy-testing == true
+          matrix_n150_accuracy=$(echo $result_sh | jq -r -c '.matrix | map(select((."runs-on" | contains("n150")) and (.["accuracy-testing"] == true)))')
+
+          matrix_n150_accuracy_skip="false"
+
+          if [ "$matrix_n150_accuracy" == "[]" ]; then
+            matrix_n150_accuracy_skip="true"
+          fi
+
+          echo "matrix_n150_accuracy=$matrix_n150_accuracy" >> $GITHUB_OUTPUT
+          echo "matrix_n150_accuracy_skip=$matrix_n150_accuracy_skip" >> $GITHUB_OUTPUT
+
   run-p150-perf-benchmarks:
     needs: filter-tests
     if: ${{ needs.filter-tests.outputs.matrix_p150_skip == 'false' }}
@@ -53,9 +75,19 @@ jobs:
       matrix: ${{ needs.filter-tests.outputs.matrix_p150 }}
       docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"

+  run-n150-accuracy-benchmarks:
+    needs: filter-tests
+    if: ${{ needs.filter-tests.outputs.matrix_n150_accuracy_skip == 'false' }}
+    secrets: inherit
+    uses: ./.github/workflows/call-perf-test.yml
+    with:
+      matrix: ${{ needs.filter-tests.outputs.matrix_n150_accuracy }}
+      docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"
+
   produce-data:
     needs:
       - run-p150-perf-benchmarks
+      - run-n150-accuracy-benchmarks
     if: always()
     runs-on: ubuntu-latest
     steps:
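The jq filter in that workflow step can be mirrored in Python for local debugging (a sketch; the `runs-on` values shown are hypothetical, since that field is attached by filter-test-matrix.py rather than stored in the matrix JSON above):

```python
def filter_n150_accuracy(matrix):
    """Python equivalent of the workflow's jq filter: keep entries whose
    runs-on contains "n150" and whose accuracy-testing flag is true."""
    return [
        entry for entry in matrix
        if "n150" in entry.get("runs-on", "")
        and entry.get("accuracy-testing") is True
    ]

matrix = [
    {"name": "llama_accuracy", "runs-on": "n150-shared", "accuracy-testing": True},
    {"name": "p150_perf", "runs-on": "p150", "accuracy-testing": True},
    {"name": "n150_perf_only", "runs-on": "n150-shared"},
]
print([e["name"] for e in filter_n150_accuracy(matrix)])
```

An empty result corresponds to the `"[]"` check in the shell step, which sets matrix_n150_accuracy_skip to "true" so the job is skipped.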

benchmark/tt-xla/conftest.py

Lines changed: 13 additions & 0 deletions
@@ -162,6 +162,13 @@ def pytest_addoption(parser):
         type=make_validator_boolean("--experimental-compile"),
         help="Enable experimental compile flag (true/false). Overrides config value.",
     )
+    parser.addoption(
+        "--accuracy-testing",
+        action="store",
+        default=None,
+        type=make_validator_boolean("--accuracy-testing"),
+        help="Enable accuracy testing mode (true/false). Uses reference data for TOP1/TOP5 accuracy.",
+    )


 @pytest.fixture
@@ -217,3 +224,9 @@ def task(request):
 @pytest.fixture
 def experimental_compile(request):
     return request.config.getoption("--experimental-compile")
+
+
+@pytest.fixture
+def accuracy_testing(request):
+    value = request.config.getoption("--accuracy-testing")
+    return value if value is not None else False
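The diff passes make_validator_boolean as the option's `type=` callable, a helper defined elsewhere in conftest.py. A plausible sketch of what such a validator does (an assumption about its behavior, not the actual implementation):

```python
def make_validator_boolean(option_name):
    """Hypothetical reconstruction: build a parser for a true/false-valued
    pytest option that rejects anything other than a boolean-like string."""
    def validate(value):
        lowered = str(value).strip().lower()
        if lowered in ("true", "1", "yes"):
            return True
        if lowered in ("false", "0", "no"):
            return False
        # Raising here makes pytest report an option-parsing error
        # instead of silently accepting a bad value.
        raise ValueError(f"{option_name} expects true/false, got {value!r}")
    return validate
```

Because the option's default is None rather than False, the accuracy_testing fixture can distinguish "flag not passed" from "explicitly disabled" before normalizing to a boolean.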
