Skip to content

Commit 3559c2a

Browse files
authored
add qnn eval CI (#14528)
As the title says, this adds a script that guards against regressions in the eval script. The test calibrates the Qwen model, runs the evaluation, and compares the resulting word perplexity against a threshold. If the perplexity is above the threshold, the test fails. <img width="703" height="432" alt="image" src="https://github.com/user-attachments/assets/ac164198-5c29-4161-8832-8e18a624f317" />
1 parent b3178bf commit 3559c2a

File tree

2 files changed

+127
-0
lines changed

2 files changed

+127
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/bin/bash
2+
# Copyright (c) Qualcomm Innovation Center, Inc.
3+
# All rights reserved
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail

# "$*" joins the arguments into the message; "$@" inside a larger string would
# split the message into several words.
echo ">>> Script invoked with arguments: $*"

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

# Download QNN_SDK. If already downloaded, export environment path
source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
install_qnn

# Assign first, export second: `export VAR="$(cmd)"` hides a failure of `cmd`
# from `set -e`.
# NOTE(review): this resolves to the parent of the script directory, whereas
# the installer above is located two levels up. Confirm which directory
# EXECUTORCH_ROOT is expected to name.
EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
export EXECUTORCH_ROOT
# QNN_SDK_ROOT is not set in this script; presumably install_qnn exports it.
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."

# The paths below are relative to the current working directory.
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Print the resolved interpreter; under `set -e` this also aborts the script
# when the interpreter cannot be found.
command -v "${PYTHON_EXECUTABLE}"
31+
32+
# -------------------------------
# Parse args
# -------------------------------
EXTRA_FLAGS=""
THRESHOLD=62.0 # default fallback

#######################################
# Parse the command-line options of this script.
# Globals:
#   EXTRA_FLAGS (written) - value given to --flags
#   THRESHOLD   (written) - value given to --threshold
# Arguments:
#   --flags <string>      extra flags for the evaluation command
#   --threshold <number>  threshold for the perplexity check
# Outputs:
#   An error message on stderr for an unknown option or a missing value.
# Returns:
#   0 on success; exits the script with status 1 on a usage error.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --flags)
        # Check explicitly so that a missing value gives a clear message
        # instead of an "unbound variable" error under `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for option: $1" >&2
          exit 1
        fi
        EXTRA_FLAGS="$2"
        shift 2
        ;;
      --threshold)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for option: $1" >&2
          exit 1
        fi
        THRESHOLD="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Config
PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
MODEL="qwen2_5-0_5b"
MAX_SEQ=1024
PTQ="16a4w"

# EXTRA_FLAGS is deliberately NOT reassigned from the positional parameters
# here: doing so after option parsing would discard the value passed via
# --flags.
62+
63+
# Run the evaluation and capture *both stdout and stderr*: the combined output
# is shown on the console and also written to a timestamped log file.
LOG_FILE="eval_${MODEL}_$(date +%Y%m%d_%H%M%S).log"

# Split the flag string on whitespace into an array. Unlike an unquoted
# $EXTRA_FLAGS expansion, this does not apply pathname expansion to the flags.
# Only a single line of flags is read.
read -r -a extra_args <<< "$EXTRA_FLAGS"

echo ">>> Running evaluation with flags: $EXTRA_FLAGS | threshold: $THRESHOLD"
# The ${arr[@]+...} form expands to nothing for an empty array, which keeps
# `set -u` from rejecting the expansion on older bash versions.
"$PYTHON_EXECUTABLE" -m executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn \
  --decoder_model "$MODEL" \
  --quant_linear_only \
  --max_seq_length "$MAX_SEQ" \
  --ptq "$PTQ" \
  ${extra_args[@]+"${extra_args[@]}"} 2>&1 | tee "$LOG_FILE"
73+
74+
# Extract last word_perplexity
# - `sed -n ... p` prints only when the pattern matches, so a log line without
#   a word_perplexity value yields an empty result instead of the raw line.
# - `|| true` keeps `set -e` / `pipefail` from aborting when grep finds no
#   match, so the explicit error message below is actually reached.
LAST_PERP=$(
  grep "INFO:root:wikitext:" "$LOG_FILE" \
    | tail -n 1 \
    | sed -nE "s/.*'word_perplexity,none': ([0-9.]+).*/\1/p"
) || true

if [[ -z "$LAST_PERP" ]]; then
  echo "❌ Could not find word_perplexity in logs!"
  exit 1
fi

echo ">>> Last word_perplexity = $LAST_PERP"

# Compare against threshold
# awk exits 1 when the value exceeds the threshold. The call sits inside the
# `if` condition because a bare failing command would terminate the script
# under `set -e` before the message could be printed. Adding 0 forces a
# numeric comparison.
if ! awk -v val="$LAST_PERP" -v thr="$THRESHOLD" 'BEGIN {exit (val + 0 > thr + 0)}'; then
  echo "❌ Regression detected: word_perplexity ($LAST_PERP) > threshold ($THRESHOLD)"
  exit 1
fi

echo "✅ Check passed: word_perplexity ($LAST_PERP) <= $THRESHOLD"

.github/workflows/trunk.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,42 @@ jobs:
973973
# Test llama2
974974
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
975975
976+
# Runs the QNN static llama eval script once per matrix config and passes the
# config's flags and threshold to it.
test-static-llama-qnn-eval-linux:
  name: test-static-llama-qnn-eval-linux
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    fail-fast: false
    matrix:
      # Each entry supplies the --flags and --threshold arguments used below.
      config:
        - name: "baseline"
          flags: ""
          threshold: 62.0
  with:
    runner: linux.2xlarge
    docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
    submodules: 'recursive'
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 180
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"
      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
      PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
      # Setup executorch
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
      # Setup install_requirements for llama
      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

      echo ">>> Running config: ${{ matrix.config.name }}"
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama_eval.sh \
        --flags "${{ matrix.config.flags }}" \
        --threshold "${{ matrix.config.threshold }}"
1011+
9761012
unittest-release:
9771013
uses: ./.github/workflows/_unittest.yml
9781014
permissions:

0 commit comments

Comments
 (0)