Skip to content

Commit b8a930d

Browse files
committed
add qnn eval script
1 parent 30a904b commit b8a930d

File tree

2 files changed

+79
-0
lines changed

2 files changed

+79
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Runs the QNN static-llama eval and fails if the reported word_perplexity
# regresses past a fixed threshold.

set -euo pipefail

# Config
PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
MODEL="qwen2_5-0_5b"
MAX_SEQ=1024
PTQ="16a4w"
THRESHOLD=62.0  # regression guardrail: fail when word_perplexity > this

# Run command and capture log (timestamped so reruns don't clobber each other)
LOG_FILE="eval_${MODEL}_$(date +%Y%m%d_%H%M%S).log"

echo ">>> Running evaluation..."
$PYTHON_EXECUTABLE -m executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn \
  --decoder_model "$MODEL" \
  --quant_linear_only \
  --max_seq_length "$MAX_SEQ" \
  --ptq "$PTQ" | tee "$LOG_FILE"

# Extract last word_perplexity.
# FIX: `|| true` is required — with `set -e -o pipefail`, a no-match grep
# (exit 1) would abort the script here, making the error message below
# unreachable.
LAST_PERP=$(grep "INFO:root:wikitext:" "$LOG_FILE" | tail -n 1 | sed -E "s/.*'word_perplexity,none': ([0-9.]+).*/\1/" || true)

if [[ -z "$LAST_PERP" ]]; then
  echo "❌ Could not find word_perplexity in logs!"
  exit 1
fi

echo ">>> Last word_perplexity = $LAST_PERP"

# Compare against threshold.
# FIX: the original ran awk as a bare statement and then checked `$?` —
# under `set -e` the script exits on awk's nonzero status before the check
# ever runs, so the regression message was dead code. Testing the command
# directly in the `if` keeps `set -e` from intervening.
if ! awk -v val="$LAST_PERP" -v thr="$THRESHOLD" 'BEGIN {exit (val > thr)}'; then
  echo "❌ Regression detected: word_perplexity ($LAST_PERP) > threshold ($THRESHOLD)"
  exit 1
fi

echo "✅ Check passed: word_perplexity ($LAST_PERP) <= $THRESHOLD"

.github/workflows/pull.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,41 @@ jobs:
583583
# Test static llama weight sharing and accuracy
584584
PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
585585
586+
587+
# CI job: run the QNN static-llama eval script and enforce its
# word_perplexity regression threshold.
test-static-llama-qnn-eval-linux:
  name: test-static-llama-qnn-eval-linux
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
    submodules: 'recursive'
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 180
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
      PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh

      # Setup executorch
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"

      # Setup install_requirements for llama
      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

      # Run static llama eval and check word_perplexity against the
      # regression threshold (comment previously copy-pasted from the
      # weight-sharing job; this job runs the eval script)
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama_eval.sh
619+
620+
586621
test-qnn-models-linux:
587622
name: test-qnn-models-linux
588623
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

0 commit comments

Comments
 (0)