
Commit 284e95b

Create a script for qnn static llama
1 parent 574838e commit 284e95b

File tree

2 files changed: +63 −45 lines
.ci/scripts/test_qnn_static_llama.sh (new file)

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

# MODEL_NAME and PYTHON_EXECUTABLE are expected to be set by the caller.
# utils.sh provides download_stories_model_artifacts.
source .ci/scripts/utils.sh

# Download and create artifacts.
PARAMS="params.json"
CHECKPOINT_FILE_NAME=""
touch "${PARAMS}"
if [[ "${MODEL_NAME}" == "llama" ]] || [[ "${MODEL_NAME}" == "stories"* ]] || [[ "${MODEL_NAME}" == "tinyllama" ]]; then
  CHECKPOINT_FILE_NAME="stories110M.pt"
  download_stories_model_artifacts
else
  echo "Unsupported model name ${MODEL_NAME}"
  exit 1
fi

# Create tokenizer.bin.
echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

# Point the environment at the repo root, the QNN SDK, and the prebuilt
# x86 Python adaptors.
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
# Run both tests to completion and report both results: disable errexit so a
# failing test does not abort the script before its exit code is captured.
set +e

# Compile only, since weight sharing is not supported on x86.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
exit_code1=$?

# Check accuracy with weight sharing disabled, since x86 does not support weight sharing.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86_64/ --executorch_root . --artifact_dir . --enable_x86_64
exit_code2=$?

set -e

# Check the exit codes and print messages.
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile-only (weight sharing) test failed with exit code $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
  echo "Static Llama accuracy test failed with exit code $exit_code2."
fi

# Return failure if either test failed.
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  exit 1
else
  exit 0
fi
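
The script reads MODEL_NAME and PYTHON_EXECUTABLE from the environment rather than defining them itself. A minimal sketch of a local invocation, assuming the script is run from the executorch repository root and that stories110m is the desired model (the value is illustrative):

# Hypothetical local run; MODEL_NAME must match one of the names the script accepts.
MODEL_NAME=stories110m PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh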

.github/workflows/pull.yml

Lines changed: 4 additions & 45 deletions
@@ -440,9 +440,8 @@ jobs:
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

-  # Compile only as weight sharing is not applicable on x86
-  test-static-llama-size-qnn-linux:
-    name: test-static-llama-size-qnn-linux
+  test-static-llama-qnn-linux:
+    name: test-static-llama-qnn-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -470,49 +469,9 @@

       # Setup install_requirements for llama
       PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
-
-      # Retrieve 110M Stories Llama Artifacts
-      PYTHON_EXECUTABLE=python . .ci/scripts/utils.sh
-      PYTHON_EXECUTABLE=python download_stories_model_artifacts
-      PYTHONPATH="${PWD}" python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
-
-      # Test static llama stories110m pte size
-      PYTHONPATH="${PWD}" python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
-
-  # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-  test-static-llama-accuracy-qnn-linux:
-    name: test-static-llama-accuracy-qnn-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    strategy:
-      fail-fast: false
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-qnn-sdk
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
-    script: |
-      # The generic Linux job chooses to use base env, not the one setup by the image
-      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-      conda activate "${CONDA_ENV}"
-
-      BUILD_TOOL="cmake"
-
-      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
-      PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
-
-      # Setup executorch
-      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
-
-      # Setup install_requirements for llama
-      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
-
-      # Retrieve 110M Stories Llama Artifacts
-      PYTHON_EXECUTABLE=python . .ci/scripts/utils.sh
-      PYTHON_EXECUTABLE=python download_stories_model_artifacts

-      # Test static llama stories110m accuracy
-      PYTHONPATH="${PWD}" python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86_64/ --executorch_root . --artifact_dir . --enable_x86_64
+      # Test static llama weight sharing and accuracy
+      PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh

   test-qnn-models-linux:
     name: test-qnn-models-linux