@@ -6,12 +6,14 @@
   pull_request:
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   push:
     branches:
       - main
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
@@ -70,7 +72,7 @@
           # Keep default values separate from the workflow dispatch inputs, so that
           # defaults are accessible during scheduled runs and can differ between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
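+          # NOTE: entries are either in-repo example model names (mv3, ic4, ...) or full
+          # Hugging Face repo IDs; keep the list comma-separated with no stray whitespace.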
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22
         run: |
           set -eux
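+          # set -eux: exit on first error (-e), treat unset variables as errors (-u),
+          # and trace each command (-x)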
@@ -201,8 +203,8 @@ jobs:
           HF_MODEL_REPO=${{ matrix.model }}
           OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
+          # Convert HF checkpoint to ET via etLLM path
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-            # Llama models on Hugging Face
             if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
@@ -272,6 +274,21 @@ jobs:
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                 --output_name="${OUT_ET_MODEL_NAME}.pte"
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
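+              # 8da4w: 8-bit dynamic activations with 4-bit grouped weights (-G 32 sets
+              # group size 32); -E 8,0 additionally quantizes the embedding layer to 8 bits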
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.models.llama.export_llama \
+                --model llama3_2 \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -d fp32 \
+                -X \
+                --xnnpack-extended-ops \
+                -qmode 8da4w -G 32 -E 8,0 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
               export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
               export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +309,75 @@ jobs:
               OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
               find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            else
-              # By default, test with the Hugging Face model and the xnnpack recipe
-              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-              python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
-          else
-            echo "Unsupported model ${{ matrix.model }}"
-            exit 1
+          elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+            if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
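+              # Only the tokenizer is fetched from the Hub; the model config comes from
+              # the in-repo params file below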
+              python -m examples.models.llama.export_llama \
+                --model qwen3-0_6b \
+                --params examples/models/qwen3/0_6b_config.json \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -d fp32 \
+                -X \
+                --xnnpack-extended-ops \
+                -qmode 8da4w \
+                -G 32 \
+                -E 8,0 \
+                --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            fi
+          fi
+
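+          # For this config, export with optimum-executorch regardless of model family
+          # (covers the remaining repos in the cron list, e.g. gemma-3, SmolLM2, OLMo)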
+          if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+            DOWNLOADED_PATH=$(
+              bash .ci/scripts/download_hf_hub.sh \
+                --model_id "${HF_MODEL_REPO}" \
+                --files "tokenizer.json"
+            )
+            echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+            # Install optimum-executorch
+            git clone https://github.com/huggingface/optimum-executorch
+            pushd optimum-executorch
+            # There is no release yet; for CI stability, always test the same commit on main
+            git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+            python install_dev.py
+            pip list
+
+            ARGS=(
+              "--model" "${HF_MODEL_REPO}"
+              "--task" "text-generation"
+              "--recipe" "xnnpack"
+              "--use_custom_sdpa"
+              "--qlinear"
+              "--qembedding"
+              "--output_dir" ".."
+            )
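+            # --use_custom_sdpa selects the custom SDPA op; --qlinear and --qembedding
+            # enable weight quantization of the linear and embedding layers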
+
+            # Add conditional arguments based on model
+            case "${HF_MODEL_REPO}" in
+              *"google/gemma-3-1b-it"*)
+                echo "--use_custom_kv_cache cannot be used for HybridCache"
+                ;;
+              *)
+                ARGS+=("--use_custom_kv_cache")
+                ;;
+            esac
+
+            optimum-cli export executorch "${ARGS[@]}"
+            popd
+
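+            # optimum-cli wrote model.pte to --output_dir ".." (the repo root), which is
+            # the working directory again after popd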
+            mv model.pte "${OUT_ET_MODEL_NAME}.pte"
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
           fi
 
-          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
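+          # tokenizer.* matches tokenizer.model (Meta checkpoints) as well as
+          # tokenizer.json (Hugging Face downloads)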
+          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" ${DOWNLOADED_PATH}/tokenizer.*
           ls -lh model.zip
-          mkdir -p "${ARTIFACTS_DIR_NAME}"
-          mv model.zip "${ARTIFACTS_DIR_NAME}"
+          mkdir -p ${ARTIFACTS_DIR_NAME}
+          mv model.zip ${ARTIFACTS_DIR_NAME}
+          ls -lh ${ARTIFACTS_DIR_NAME}
         elif [[ ${{ matrix.model }} == "llama" ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh