From 2c5f9dcc77a1526bd0ece4128fc1df88ea9151b5 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Wed, 18 Dec 2024 19:16:39 -0800
Subject: [PATCH] Add Llama3.2 1B HTP to benchmark

---
 .ci/scripts/gather_benchmark_configs.py |  3 +--
 .github/workflows/android-perf.yml      | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index a9271537786..ef77d179ab3 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -34,8 +34,7 @@
     ],
     "android": [
         "qnn_q8",
-        # TODO: Add support for llama3 htp
-        # "llama3_qnn_htp",
+        "llama3_qnn_htp",
     ],
     "ios": [
         "coreml_fp16",
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 6cf583a5992..473e9da1bba 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -132,10 +132,10 @@ jobs:
       matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
       fail-fast: false
     with:
-      runner: linux.2xlarge.memory
+      runner: linux.4xlarge.memory
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
-      timeout: 60
+      timeout: 240
       upload-artifact: android-models
       upload-artifact-to-s3: true
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -238,12 +238,17 @@ jobs:
             --output_name="${OUT_ET_MODEL_NAME}.pte"
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
           export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+          echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}"
           export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
           export PYTHONPATH=$(pwd)/..
+          echo "PYTHONPATH=${PYTHONPATH}"
+          python -c "import sys; sys.stdout.flush()"
+          sync # Ensures any buffered disk writes are committed
 
-          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-          python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
+          python -m examples.qualcomm.oss_scripts.llama3_2.llama \
             --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
             --params "${DOWNLOADED_PATH}/params.json" \
             --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
@@ -252,10 +257,10 @@ jobs:
             -m SM8650 \
             --model_size 1B \
             --model_mode kv \
+            -b "cmake-out" \
             --prompt "Once"
 
           OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
-          find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
         else
           # By default, test with the Hugging Face model and the xnnpack recipe