Skip to content

Commit cd3a018

Browse files
committed
- Pass --accuracy-testing flag to pytest for accuracy tests
- Call filter-test-matrix.py with --sh-runner flag for n150 tests
- Filter by runs-on=n150 AND accuracy-testing=true using jq
- Add run-n150-accuracy-benchmarks job that runs on shared runners
1 parent 3835256 commit cd3a018

File tree

3 files changed

+75
-22
lines changed

3 files changed

+75
-22
lines changed

.github/workflows/call-perf-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ jobs:
197197
python benchmark/benchmark.py -p ${{ matrix.build.project}} -m ${{ matrix.build.name }} -bs ${{ matrix.build.bs }} -df ${{ matrix.build.df }} -lp ${{ matrix.build.lp }} ${{ matrix.build.input_sequence_length && format('-isl {0}', matrix.build.input_sequence_length) }} -ts ${{ matrix.build.ts }} -o ${{ steps.strings.outputs.perf_report_json_file }} ${{ inputs.run_id_source && format('-r {0}', inputs.run_id_source) }}
198198
else
199199
# Run with pytest
200-
pytest -svv "${{ matrix.build.pytest }}" --output-file=${{ steps.strings.outputs.perf_report_json_file }}
200+
pytest -svv "${{ matrix.build.pytest }}" ${{ matrix.build.accuracy-testing && '--accuracy-testing true' || '' }} --output-file=${{ steps.strings.outputs.perf_report_json_file }}
201201
fi
202202
203203
- name: Dump stablehlo to report

.github/workflows/perf-bench-matrix.json

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -309,107 +309,128 @@
309309
{
310310
"name": "llama_3_2_1b_instruct_accuracy",
311311
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
312-
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b_accuracy"
312+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_1b",
313+
"accuracy-testing": true
313314
},
314315
{
315316
"name": "llama_3_2_3b_instruct_accuracy",
316317
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
317-
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b_accuracy"
318+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_2_3b",
319+
"accuracy-testing": true
318320
},
319321
{
320322
"name": "llama_3_1_8b_instruct_accuracy",
321323
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
322-
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b_accuracy"
324+
"pytest": "benchmark/tt-xla/test_llms.py::test_llama_3_1_8b",
325+
"accuracy-testing": true
323326
},
324327
{
325328
"name": "mistral_7b_accuracy",
326329
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1 protobuf sentencepiece",
327-
"pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b_accuracy"
330+
"pytest": "benchmark/tt-xla/test_llms.py::test_mistral_7b",
331+
"accuracy-testing": true
328332
},
329333
{
330334
"name": "qwen_2_5_7b_instruct_accuracy",
331335
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
332-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b_accuracy"
336+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_7b",
337+
"accuracy-testing": true
333338
},
334339
{
335340
"name": "google_gemma-1.1-2b-it_accuracy",
336341
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
337-
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b_accuracy"
342+
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_1_1_2b",
343+
"accuracy-testing": true
338344
},
339345
{
340346
"name": "google_gemma-2-2b-it_accuracy",
341347
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 tqdm transformers==4.57.1",
342-
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b_accuracy"
348+
"pytest": "benchmark/tt-xla/test_llms.py::test_gemma_2_2b",
349+
"accuracy-testing": true
343350
},
344351
{
345352
"name": "microsoft_phi-1_accuracy",
346353
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
347-
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1_accuracy"
354+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1",
355+
"accuracy-testing": true
348356
},
349357
{
350358
"name": "microsoft_phi-1_5_accuracy",
351359
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
352-
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5_accuracy"
360+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi1_5",
361+
"accuracy-testing": true
353362
},
354363
{
355364
"name": "microsoft_phi-2_accuracy",
356365
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
357-
"pytest": "benchmark/tt-xla/test_llms.py::test_phi2_accuracy"
366+
"pytest": "benchmark/tt-xla/test_llms.py::test_phi2",
367+
"accuracy-testing": true
358368
},
359369
{
360370
"name": "tiiuae_falcon3-1b-base_accuracy",
361371
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
362-
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b_accuracy"
372+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_1b",
373+
"accuracy-testing": true
363374
},
364375
{
365376
"name": "tiiuae_falcon3-3b-base_accuracy",
366377
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
367-
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b_accuracy"
378+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_3b",
379+
"accuracy-testing": true
368380
},
369381
{
370382
"name": "tiiuae_falcon3-7b-base_accuracy",
371383
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
372-
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b_accuracy"
384+
"pytest": "benchmark/tt-xla/test_llms.py::test_falcon3_7b",
385+
"accuracy-testing": true
373386
},
374387
{
375388
"name": "qwen_2_5_0_5b_instruct_accuracy",
376389
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
377-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b_accuracy"
390+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_0_5b",
391+
"accuracy-testing": true
378392
},
379393
{
380394
"name": "qwen_2_5_1_5b_instruct_accuracy",
381395
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
382-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b_accuracy"
396+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_1_5b",
397+
"accuracy-testing": true
383398
},
384399
{
385400
"name": "qwen_2_5_3b_instruct_accuracy",
386401
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
387-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b_accuracy"
402+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_2_5_3b",
403+
"accuracy-testing": true
388404
},
389405
{
390406
"name": "qwen_3_0_6b_accuracy",
391407
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
392-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b_accuracy"
408+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_0_6b",
409+
"accuracy-testing": true
393410
},
394411
{
395412
"name": "qwen_3_1_7b_accuracy",
396413
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
397-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b_accuracy"
414+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_1_7b",
415+
"accuracy-testing": true
398416
},
399417
{
400418
"name": "qwen_3_4b_accuracy",
401419
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
402-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b_accuracy"
420+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_4b",
421+
"accuracy-testing": true
403422
},
404423
{
405424
"name": "qwen_3_8b_accuracy",
406425
"pyreq": "datasets loguru pytest requests tabulate timm torch==2.9.0 torchvision==0.24.0 tqdm transformers==4.57.1",
407-
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b_accuracy"
426+
"pytest": "benchmark/tt-xla/test_llms.py::test_qwen_3_8b",
427+
"accuracy-testing": true
408428
},
409429
{
410430
"name": "ministral_8b_accuracy",
411431
"pyreq": "datasets loguru pytest requests torch==2.9.0 tqdm transformers==4.57.1",
412-
"pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b_accuracy"
432+
"pytest": "benchmark/tt-xla/test_llms.py::test_ministral_8b",
433+
"accuracy-testing": true
413434
}
414435
]
415436
}

.github/workflows/perf-benchmark-experimental.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ jobs:
1515
outputs:
1616
matrix_p150: ${{ steps.set-perf-benchmarks.outputs.matrix_p150 }}
1717
matrix_p150_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_p150_skip }}
18+
matrix_n150_accuracy: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy }}
19+
matrix_n150_accuracy_skip: ${{ steps.set-perf-benchmarks.outputs.matrix_n150_accuracy_skip }}
1820
steps:
1921
- name: Checkout repository
2022
uses: actions/checkout@v4
@@ -28,6 +30,7 @@ jobs:
2830
id: set-perf-benchmarks
2931
shell: bash
3032
run: |
33+
# Filter for regular p150 tests
3134
result=$(python .github/workflows/filter-test-matrix.py \
3235
.github/workflows/perf-bench-matrix.json \
3336
"tt-forge")
@@ -44,6 +47,25 @@ jobs:
4447
echo "matrix_p150=$matrix_p150" >> $GITHUB_OUTPUT
4548
echo "matrix_p150_skip=$matrix_p150_skip" >> $GITHUB_OUTPUT
4649
50+
# Filter for n150 accuracy tests
51+
# Call filter-test-matrix.py with --sh-runner flag to map n150 to shared runners
52+
result_sh=$(python .github/workflows/filter-test-matrix.py \
53+
.github/workflows/perf-bench-matrix.json \
54+
"tt-forge" \
55+
--sh-runner)
56+
57+
# Filter by: runs-on contains "n150" AND accuracy-testing == true
58+
matrix_n150_accuracy=$(echo $result_sh | jq -r -c '.matrix | map(select((."runs-on" | contains("n150")) and (.["accuracy-testing"] == true)))')
59+
60+
matrix_n150_accuracy_skip="false"
61+
62+
if [ "$matrix_n150_accuracy" == "[]" ]; then
63+
matrix_n150_accuracy_skip="true"
64+
fi
65+
66+
echo "matrix_n150_accuracy=$matrix_n150_accuracy" >> $GITHUB_OUTPUT
67+
echo "matrix_n150_accuracy_skip=$matrix_n150_accuracy_skip" >> $GITHUB_OUTPUT
68+
4769
run-p150-perf-benchmarks:
4870
needs: filter-tests
4971
if: ${{ needs.filter-tests.outputs.matrix_p150_skip == 'false' }}
@@ -53,9 +75,19 @@ jobs:
5375
matrix: ${{ needs.filter-tests.outputs.matrix_p150 }}
5476
docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"
5577

78+
run-n150-accuracy-benchmarks:
79+
needs: filter-tests
80+
if: ${{ needs.filter-tests.outputs.matrix_n150_accuracy_skip == 'false' }}
81+
secrets: inherit
82+
uses: ./.github/workflows/call-perf-test.yml
83+
with:
84+
matrix: ${{ needs.filter-tests.outputs.matrix_n150_accuracy }}
85+
docker-image: "ghcr.io/tenstorrent/tt-xla-slim:nightly-latest"
86+
5687
produce-data:
5788
needs:
5889
- run-p150-perf-benchmarks
90+
- run-n150-accuracy-benchmarks
5991
if: always()
6092
runs-on: ubuntu-latest
6193
steps:

0 commit comments

Comments
 (0)