7070 # Keep the default values separate from the workflow dispatch inputs so that the
7171 # defaults remain accessible during scheduled runs and can differ between
7272 # on-demand and periodic benchmarking.
73- CRON_DEFAULT_MODELS : ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
73+ CRON_DEFAULT_MODELS : ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
7474 CRON_DEFAULT_DEVICES : samsung_galaxy_s22
7575 run : |
7676 set -eux
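# Illustrative note (not part of the workflow): GitHub's `${{ cond && A || B }}`
# expression acts as a ternary, so CRON_DEFAULT_MODELS above expands to the full
# model list only for `schedule` events and falls back to 'llama' on every other
# trigger, e.g. a manual run:
#
#   github.event_name == 'schedule'          -> 'llama,mv3,mv2,...,allenai/OLMo-1B-hf'
#   github.event_name == 'workflow_dispatch' -> 'llama'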
@@ -201,8 +201,8 @@ jobs:
201201 HF_MODEL_REPO=${{ matrix.model }}
202202 OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
203203
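# Worked example (illustrative): for HF_MODEL_REPO=meta-llama/Llama-3.2-1B with
# config=et_xnnpack_custom_spda_kv_cache_8da4w, the pipeline above strips the org
# prefix, replaces underscores with dashes, and lowercases, producing:
#
#   OUT_ET_MODEL_NAME=llama-3.2-1b_et_xnnpack_custom_spda_kv_cache_8da4w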
204+ # Convert the Hugging Face checkpoint to an ExecuTorch .pte via the etLLM export path
204205 if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
205- # Llama models on Hugging Face
206206 if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
207207 # SpinQuant
208208 # Download the prequantized checkpoint from Hugging Face
@@ -272,6 +272,21 @@ jobs:
272272 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
273273 --output_name="${OUT_ET_MODEL_NAME}.pte"
274274 ls -lh "${OUT_ET_MODEL_NAME}.pte"
275+ elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
276+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
277+ python -m examples.models.llama.export_llama \
278+ --model llama3_2 \
279+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
280+ --params "${DOWNLOADED_PATH}/params.json" \
281+ -kv \
282+ --use_sdpa_with_kv_cache \
283+ -d fp32 \
284+ -X \
285+ --xnnpack-extended-ops \
286+ -qmode 8da4w -G 32 -E 8,0 \
287+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
288+ --output_name="${OUT_ET_MODEL_NAME}.pte"
289+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
275290 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
276291 export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
277292 export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +307,75 @@ jobs:
292307 OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm's script hard-codes this output name
293308 find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
294309 ls -lh "${OUT_ET_MODEL_NAME}.pte"
295- else
296- # By default, test with the Hugging Face model and the xnnpack recipe
297- DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
298- python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
299- ls -lh "${OUT_ET_MODEL_NAME}.pte"
300310 fi
301- else
302- echo "Unsupported model ${{ matrix.model }}"
303- exit 1
311+ elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
312+ if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
313+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
314+ python -m examples.models.llama.export_llama \
315+ --model qwen3-0_6b \
316+ --params examples/models/qwen3/0_6b_config.json \
317+ -kv \
318+ --use_sdpa_with_kv_cache \
319+ -d fp32 \
320+ -X \
321+ --xnnpack-extended-ops \
322+ -qmode 8da4w \
323+ -G 32 \
324+ -E 8,0 \
325+ --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
326+ --output_name="${OUT_ET_MODEL_NAME}.pte"
327+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
328+ fi
329+ fi
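# A minimal sketch of the shared etLLM export shape used by both branches above,
# assuming a model whose params config ships under examples/models/ (the model
# name and params path below are placeholders, not real workflow values):
#
#   python -m examples.models.llama.export_llama \
#     --model <etllm-model-name> \
#     --params <path/to/params.json> \
#     -kv --use_sdpa_with_kv_cache \
#     -d fp32 -X --xnnpack-extended-ops \
#     -qmode 8da4w -G 32 -E 8,0 \
#     --output_name="${OUT_ET_MODEL_NAME}.pte"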
330+
331+ if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
332+ DOWNLOADED_PATH=$(
333+ bash .ci/scripts/download_hf_hub.sh \
334+ --model_id "${HF_MODEL_REPO}" \
335+ --files "tokenizer.json"
336+ )
337+ echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
338+
339+ # Install optimum-executorch
340+ git clone https://github.com/huggingface/optimum-executorch
341+ pushd optimum-executorch
342+ # No release yet; for CI stability, always test against the same pinned commit on main
343+ git checkout 1c653dc49812fc431a22312c7295d97005d22e12
344+ python install_dev.py
345+ pip list
346+
347+ ARGS=(
348+ "--model" "${HF_MODEL_REPO}"
349+ "--task" "text-generation"
350+ "--recipe" "xnnpack"
351+ "--use_custom_sdpa"
352+ "--qlinear"
353+ "--qembedding"
354+ "--output_dir" ".."
355+ )
356+
357+ # Add conditional arguments based on model
358+ case "${HF_MODEL_REPO}" in
359+ *"google/gemma-3-1b-it"*)
360+ echo "--use_custom_kv_cache can not be used for HybridCache"
361+ ;;
362+ *)
363+ ARGS+=("--use_custom_kv_cache")
364+ ;;
365+ esac
366+
367+ optimum-cli export executorch "${ARGS[@]}"
368+ popd
369+
370+ mv model.pte "${OUT_ET_MODEL_NAME}.pte"
371+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
304372 fi
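# Local reproduction sketch (assumptions: the same pinned commit works outside CI,
# and the chosen model does not use HybridCache so --use_custom_kv_cache applies;
# the model id and output dir are illustrative):
#
#   git clone https://github.com/huggingface/optimum-executorch
#   cd optimum-executorch
#   git checkout 1c653dc49812fc431a22312c7295d97005d22e12
#   python install_dev.py
#   optimum-cli export executorch \
#     --model HuggingFaceTB/SmolLM2-135M --task text-generation --recipe xnnpack \
#     --use_custom_sdpa --use_custom_kv_cache --qlinear --qembedding \
#     --output_dir ./out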
305373
306- zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
374+ zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}"/tokenizer.*
307375 ls -lh model.zip
308- mkdir -p "${ARTIFACTS_DIR_NAME}"
309- mv model.zip "${ARTIFACTS_DIR_NAME}"
376+ mkdir -p "${ARTIFACTS_DIR_NAME}"
377+ mv model.zip "${ARTIFACTS_DIR_NAME}"
378+ ls -lh "${ARTIFACTS_DIR_NAME}"
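# Optional sanity check (illustrative): confirm the archive bundles both the .pte
# and the tokenizer before it is handed off as an artifact:
#
#   unzip -l "${ARTIFACTS_DIR_NAME}/model.zip"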
310379 elif [[ ${{ matrix.model }} == "llama" ]]; then
311380 # Install requirements for export_llama
312381 PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh