@@ -6,21 +6,23 @@ on:
   pull_request:
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   push:
     branches:
       - main
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
       models:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -36,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -70,7 +72,7 @@ jobs:
           # Keep default values separate from the workflow dispatch inputs, so that defaults
           # are accessible during scheduled runs and different defaults can be used for
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
        run: |
          set -eux
@@ -201,8 +203,8 @@ jobs:
            HF_MODEL_REPO=${{ matrix.model }}
            OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

+            # Convert the Hugging Face checkpoint to an ExecuTorch .pte via the etLLM export path
            if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-                # Llama models on Hugging Face
                if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
                    # SpinQuant
                    # Download the prequantized checkpoint from Hugging Face
@@ -272,6 +274,21 @@ jobs:
                      --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                      --output_name="${OUT_ET_MODEL_NAME}.pte"
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
+                elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+                    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
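+                    # Flag cheat sheet for export_llama: -kv enables the KV cache, -X targets
+                    # XNNPACK, -qmode 8da4w quantizes to 8-bit dynamic activations with 4-bit
+                    # weights, -G 32 sets the weight group size, and -E 8,0 quantizes embeddings
+                    # to 8 bits.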
+                    python -m examples.models.llama.export_llama \
+                      --model llama3_2 \
+                      --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                      --params "${DOWNLOADED_PATH}/params.json" \
+                      -kv \
+                      --use_sdpa_with_kv_cache \
+                      -d fp32 \
+                      -X \
+                      --xnnpack-extended-ops \
+                      -qmode 8da4w -G 32 -E 8,0 \
+                      --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                      --output_name="${OUT_ET_MODEL_NAME}.pte"
+                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
                    export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
                    export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +309,75 @@ jobs:
                    OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
                    find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
-                else
-                    # By default, test with the Hugging Face model and the xnnpack recipe
-                    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-                    python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                fi
-            else
-                echo "Unsupported model ${{ matrix.model }}"
-                exit 1
+            elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+              if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
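+                # Note: Qwen3-0.6B's params config ships in-tree, and 151644/151645 are
+                # Qwen's <|im_start|>/<|im_end|> token ids, used below as BOS/EOS.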
+                python -m examples.models.llama.export_llama \
+                  --model qwen3-0_6b \
+                  --params examples/models/qwen3/0_6b_config.json \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -d fp32 \
+                  -X \
+                  --xnnpack-extended-ops \
+                  -qmode 8da4w \
+                  -G 32 \
+                  -E 8,0 \
+                  --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+
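+            # hf_xnnpack configs are exported through optimum-executorch below, rather
+            # than the etLLM export_llama path above.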
+            if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.json"
+              )
+              echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+              # Install optimum-executorch
+              git clone https://github.com/huggingface/optimum-executorch
+              pushd optimum-executorch
+              # There is no release yet; for CI stability, always test the same commit on main
+              git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+              python install_dev.py
+              pip list
+
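+              # Build the optimum-cli argument list; custom SDPA plus quantized
+              # linear/embedding ops target the XNNPACK recipe.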
+              ARGS=(
+                "--model" "${HF_MODEL_REPO}"
+                "--task" "text-generation"
+                "--recipe" "xnnpack"
+                "--use_custom_sdpa"
+                "--qlinear"
+                "--qembedding"
+                "--output_dir" ".."
+              )
+
+              # Add conditional arguments based on the model
+              case "${HF_MODEL_REPO}" in
+                *"google/gemma-3-1b-it"*)
+                  echo "--use_custom_kv_cache cannot be used with HybridCache"
+                  ;;
+                *)
+                  ARGS+=("--use_custom_kv_cache")
+                  ;;
+              esac
+
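+              # The exporter writes model.pte into --output_dir (".." here, i.e. the repo
+              # checkout one level up), where it is renamed after popd.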
+              optimum-cli export executorch "${ARGS[@]}"
+              popd
+
+              mv model.pte "${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            fi

-            zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+            zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
            ls -lh model.zip
-            mkdir -p "${ARTIFACTS_DIR_NAME}"
-            mv model.zip "${ARTIFACTS_DIR_NAME}"
+            mkdir -p ${ARTIFACTS_DIR_NAME}
+            mv model.zip ${ARTIFACTS_DIR_NAME}
+            ls -lh ${ARTIFACTS_DIR_NAME}
        elif [[ ${{ matrix.model }} == "llama" ]]; then
            # Install requirements for export_llama
            PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh