# Separate default values from the workflow dispatch, so that defaults remain
# accessible during scheduled runs and so that on-demand and periodic benchmarking
# can use different defaults.
- CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+ CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
CRON_DEFAULT_DEVICES: samsung_galaxy_s22
run: |
set -eux
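A minimal sketch (annotation, not part of this diff) of how the run step can fall back to these defaults when no dispatch input is given; INPUT_MODELS is an assumed name standing in for the workflow_dispatch `models` input:

# Explicit workflow_dispatch input wins; otherwise use the cron default.
MODELS="${INPUT_MODELS:-${CRON_DEFAULT_MODELS}}"
echo "Benchmarking: ${MODELS}"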
@@ -201,8 +201,8 @@ jobs:
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
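# Annotation: for example, HF_MODEL_REPO=Qwen/Qwen3-0.6B with
# matrix.config=et_xnnpack_custom_spda_kv_cache_8da4w resolves as
#   awk -F'/' '{print $2}'       -> Qwen3-0.6B
#   sed 's/_/-/g'                -> Qwen3-0.6B  (underscores, if any, become dashes)
#   tr '[:upper:]' '[:lower:]'   -> qwen3-0.6b
#   OUT_ET_MODEL_NAME            -> qwen3-0.6b_et_xnnpack_custom_spda_kv_cache_8da4w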

+ # Convert HF checkpoint to ET via etLLM path
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
- # Llama models on Hugging Face
if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
# SpinQuant
# Download prequantized checkpoint from Hugging Face
@@ -272,6 +272,21 @@ jobs:
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
+ elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+ python -m examples.models.llama.export_llama \
+ --model llama3_2 \
+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+ --params "${DOWNLOADED_PATH}/params.json" \
+ -kv \
+ --use_sdpa_with_kv_cache \
+ -d fp32 \
+ -X \
+ --xnnpack-extended-ops \
+ -qmode 8da4w -G 32 -E 8,0 \
+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
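# Annotation, as I read the export_llama options (not part of the diff): -kv
# enables the KV cache, --use_sdpa_with_kv_cache swaps in the custom SDPA op,
# -d fp32 sets the dtype, -X delegates to XNNPACK (--xnnpack-extended-ops adds
# the quantized kernels), -qmode 8da4w selects 8-bit dynamic activations with
# 4-bit weights, -G 32 is the weight group size, and -E 8,0 quantizes embeddings
# to 8 bits per-channel. The Qwen3 branch below applies the same recipe.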
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +307,75 @@ jobs:
OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
ls -lh "${OUT_ET_MODEL_NAME}.pte"
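# Annotation: the find/mv above locates the generated .pte wherever Qualcomm's
# script wrote it (excluding the working directory itself) and moves it into
# the working directory so the shared zip step below can find it.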
- else
- # By default, test with the Hugging Face model and the xnnpack recipe
- DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
- python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
- ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
- else
- echo "Unsupported model ${{ matrix.model }}"
- exit 1
+ elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+ if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
+ python -m examples.models.llama.export_llama \
+ --model qwen3-0_6b \
+ --params examples/models/qwen3/0_6b_config.json \
+ -kv \
+ --use_sdpa_with_kv_cache \
+ -d fp32 \
+ -X \
+ --xnnpack-extended-ops \
+ -qmode 8da4w \
+ -G 32 \
+ -E 8,0 \
+ --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
+ fi
+ fi
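# Annotation: the Qwen3-0.6B branch reuses the etLLM export path with the
# in-tree params file examples/models/qwen3/0_6b_config.json; the metadata ids
# 151644 / 151645 are presumably Qwen's <|im_start|> / <|im_end|> chat tokens.
# --subdir "." places tokenizer.json directly in ${DOWNLOADED_PATH}, where the
# zip step below picks it up via the tokenizer.* glob.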
+
+ if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+ DOWNLOADED_PATH=$(
+ bash .ci/scripts/download_hf_hub.sh \
+ --model_id "${HF_MODEL_REPO}" \
+ --files "tokenizer.json"
+ )
+ echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+ # Install optimum-executorch
+ git clone https://github.com/huggingface/optimum-executorch
+ pushd optimum-executorch
+ # There is no release yet; for CI stability, always test from the same commit on main
+ git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+ python install_dev.py
+ pip list
+
+ ARGS=(
+ "--model" "${HF_MODEL_REPO}"
+ "--task" "text-generation"
+ "--recipe" "xnnpack"
+ "--use_custom_sdpa"
+ "--qlinear"
+ "--qembedding"
+ "--output_dir" ".."
+ )
+
+ # Add conditional arguments based on the model
+ case "${HF_MODEL_REPO}" in
+ *"google/gemma-3-1b-it"*)
+ echo "--use_custom_kv_cache cannot be used with HybridCache"
+ ;;
+ *)
+ ARGS+=("--use_custom_kv_cache")
+ ;;
+ esac
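# Annotation: for google/gemma-3-1b-it the array above expands to roughly
#   optimum-cli export executorch \
#     --model google/gemma-3-1b-it --task text-generation --recipe xnnpack \
#     --use_custom_sdpa --qlinear --qembedding --output_dir ..
# (no --use_custom_kv_cache, since Gemma 3 uses HybridCache). With
# --output_dir "..", model.pte lands one level above optimum-executorch/, i.e.
# in the checkout root after popd, where the mv below renames it.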
+
+ optimum-cli export executorch "${ARGS[@]}"
+ popd
+
+ mv model.pte "${OUT_ET_MODEL_NAME}.pte"
+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi

- zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+ zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}"/tokenizer.*
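# Annotation: the tokenizer.* glob covers both cases, tokenizer.model for the
# meta-llama/etLLM path and tokenizer.json for models exported via
# optimum-executorch.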
ls -lh model.zip
- mkdir -p "${ARTIFACTS_DIR_NAME}"
- mv model.zip "${ARTIFACTS_DIR_NAME}"
+ mkdir -p ${ARTIFACTS_DIR_NAME}
+ mv model.zip ${ARTIFACTS_DIR_NAME}
+ ls -lh ${ARTIFACTS_DIR_NAME}
elif [[ ${{ matrix.model }} == "llama" ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh