- Pass --accuracy-testing flag to pytest for accuracy tests

dgolubovicTT · dgolubovicTT · commit cd3a018e9c20 · 2026-02-11T13:31:24.000Z
- Call filter-test-matrix.py with --sh-runner flag for n150 tests
- Filter by runs-on=n150 AND accuracy-testing=true using jq
- Add run-n150-accuracy-benchmarks job that runs on shared runners
diff --git a/.github/workflows/call-perf-test.yml b/.github/workflows/call-perf-test.yml
@@ -197,7 +197,7 @@ jobs:
           python benchmark/benchmark.py -p ${{ matrix.build.project}} -m ${{ matrix.build.name }} -bs ${{ matrix.build.bs }} -df ${{ matrix.build.df }} -lp ${{ matrix.build.lp }} ${{ matrix.build.input_sequence_length && format('-isl {0}', matrix.build.input_sequence_length) }} -ts ${{ matrix.build.ts }} -o ${{ steps.strings.outputs.perf_report_json_file }} ${{ inputs.run_id_source && format('-r {0}', inputs.run_id_source) }}
         else
           # Run with pytest
-          pytest -svv "${{ matrix.build.pytest }}" --output-file=${{ steps.strings.outputs.perf_report_json_file }}
+          pytest -svv "${{ matrix.build.pytest }}" ${{ matrix.build.accuracy-testing && '--accuracy-testing true' || '' }} --output-file=${{ steps.strings.outputs.perf_report_json_file }}
         fi
 
     - name: Dump stablehlo to report
diff --git a/.github/workflows/perf-bench-matrix.json b/.github/workflows/perf-bench-matrix.json
@@ -309,107 +309,128 @@
       {
         "name": "llama_3_2_1b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b",
+        "accuracy-testing": true
       },
       {
         "name": "llama_3_2_3b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b",
+        "accuracy-testing": true
       },
       {
         "name": "llama_3_1_8b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b",
+        "accuracy-testing": true
       },
       {
         "name": "mistral_7b_accuracy",
         "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1 protobuf sentencepiece",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_2_5_7b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b",
+        "accuracy-testing": true
       },
       {
         "name": "google_gemma-1.1-2b-it_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b",
+        "accuracy-testing": true
       },
       {
         "name": "google_gemma-2-2b-it_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b",
+        "accuracy-testing": true
       },
       {
         "name": "microsoft_phi-1_accuracy",
         "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_phi1_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_phi1",
+        "accuracy-testing": true
       },
       {
         "name": "microsoft_phi-1_5_accuracy",
         "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5",
+        "accuracy-testing": true
       },
       {
         "name": "microsoft_phi-2_accuracy",
         "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_phi2_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_phi2",
+        "accuracy-testing": true
       },
       {
         "name": "tiiuae_falcon3-1b-base_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b",
+        "accuracy-testing": true
       },
       {
         "name": "tiiuae_falcon3-3b-base_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b",
+        "accuracy-testing": true
       },
       {
         "name": "tiiuae_falcon3-7b-base_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_2_5_0_5b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_2_5_1_5b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_2_5_3b_instruct_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_3_0_6b_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_3_1_7b_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_3_4b_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b",
+        "accuracy-testing": true
       },
       {
         "name": "qwen_3_8b_accuracy",
         "pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b",
+        "accuracy-testing": true
       },
       {
         "name": "ministral_8b_accuracy",
         "pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
-        "pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b_accuracy"
+        "pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b",
+        "accuracy-testing": true
       }
     ]
   }
diff --git a/.github/workflows/perf-benchmark-experimental.yml b/.github/workflows/perf-benchmark-experimental.yml
@@ -15,6 +15,8 @@ jobs:
     outputs:
       matrix_p150: ${{ steps.set-perf-benchmarks.outputs.matrix_p150 }}
       matrix_p150_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_p150_skip }}
+      matrix_n150_accuracy: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy }}
+      matrix_n150_accuracy_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy_skip }}
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
@@ -28,6 +30,7 @@ jobs:
       id: set-perf-benchmarks
       shell: bash
       run: |
+        # Filter for regular p150 tests
         result=$(python .github/workflows/filter-test-matrix.py \
           .github/workflows/perf-bench-matrix.json \
           "tt-forge")
@@ -44,6 +47,25 @@ jobs:
         echo "matrix_p150=$matrix_p150" >> $GITHUB_OUTPUT
         echo "matrix_p150_skip=$matrix_p150_skip" >> $GITHUB_OUTPUT
 
+        # Filter for n150 accuracy tests
+        # Call filter-test-matrix.py with --sh-runner flag to map n150 to shared runners
+        result_sh=$(python .github/workflows/filter-test-matrix.py \
+          .github/workflows/perf-bench-matrix.json \
+          "tt-forge" \
+          --sh-runner)
+
+        # Keep entries whose runs-on contains "n150" AND accuracy-testing == true; the JSON is quoted so the shell cannot word-split or glob it
+        matrix_n150_accuracy=$(printf '%s\n' "$result_sh" | jq -r -c '.matrix | map(select((."runs-on" | contains("n150")) and (.["accuracy-testing"] == true)))')
+
+        matrix_n150_accuracy_skip="false"
+
+        if [ "$matrix_n150_accuracy" == "[]" ]; then
+          matrix_n150_accuracy_skip="true"
+        fi
+
+        echo "matrix_n150_accuracy=$matrix_n150_accuracy" >> "$GITHUB_OUTPUT"
+        echo "matrix_n150_accuracy_skip=$matrix_n150_accuracy_skip" >> "$GITHUB_OUTPUT"
+
   run-p150-perf-benchmarks:
     needs: filter-tests
     if: ${{ needs.filter-tests.outputs.matrix_p150_skip == 'false' }}
@@ -53,9 +75,19 @@ jobs:
       matrix: ${{ needs.filter-tests.outputs.matrix_p150 }}
       docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"
 
+  run-n150-accuracy-benchmarks:
+    needs: filter-tests
+    if: ${{ needs.filter-tests.outputs.matrix_n150_accuracy_skip == 'false' }}
+    secrets: inherit
+    uses: ./.github/workflows/call-perf-test.yml
+    with:
+      matrix: ${{ needs.filter-tests.outputs.matrix_n150_accuracy }}
+      docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"
+
   produce-data:
     needs:
       - run-p150-perf-benchmarks
+      - run-n150-accuracy-benchmarks
     if: always()
     runs-on: ubuntu-latest
     steps: