From 2c5f9dcc77a1526bd0ece4128fc1df88ea9151b5 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Wed, 18 Dec 2024 19:16:39 -0800
Subject: [PATCH] Add Llama3.2 1B HTP to benchmark

---
 .ci/scripts/gather_benchmark_configs.py |  3 +--
 .github/workflows/android-perf.yml      | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index a9271537786..ef77d179ab3 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -34,8 +34,7 @@
     ],
     "android": [
         "qnn_q8",
-        # TODO: Add support for llama3 htp
-        # "llama3_qnn_htp",
+        "llama3_qnn_htp",
     ],
     "ios": [
         "coreml_fp16",
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 6cf583a5992..473e9da1bba 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -132,10 +132,10 @@ jobs:
       matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
       fail-fast: false
     with:
-      runner: linux.2xlarge.memory
+      runner: linux.4xlarge.memory
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
-      timeout: 60
+      timeout: 240
       upload-artifact: android-models
       upload-artifact-to-s3: true
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -238,12 +238,17 @@ jobs:
             --output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
           export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+          echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}"
           export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
           export PYTHONPATH=$(pwd)/..
+          echo "PYTHONPATH=${PYTHONPATH}"
+          python -c "import sys; sys.stdout.flush()"
+          sync # Ensures any buffered disk writes are committed
 
-          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
+          python -m examples.qualcomm.oss_scripts.llama3_2.llama \
             --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
             --params "${DOWNLOADED_PATH}/params.json" \
             --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
@@ -252,10 +257,10 @@ jobs:
             -m SM8650 \
             --model_size 1B \
             --model_mode kv \
+            -b "cmake-out" \
             --prompt "Once"
 
           OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
-          find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         else
           # By default, test with the Hugging Face model and the xnnpack recipe