@@ -214,23 +214,23 @@ jobs:
214214 --files "tokenizer.model" "params.json" "consolidated.00.pth"
215215 )
216216 # Export using ExecuTorch's model definition
217- python -m examples.models.llama.export_llama \
218- --model "llama3_2" \
219- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
220- --params "${DOWNLOADED_PATH}/params.json" \
221- --use_sdpa_with_kv_cache \
222- -X \
223- --xnnpack-extended-ops \
224- --preq_mode 8da4w_output_8da8w \
225- --preq_group_size 32 \
226- --max_seq_length 2048 \
227- --max_context_length 2048 \
228- --output_name "${OUT_ET_MODEL_NAME}.pte" \
229- -kv \
230- -d fp32 \
231- --preq_embedding_quantize 8,0 \
232- --use_spin_quant native \
233- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
217+ python -m extension.llm.export.export_llm \
218+ base.model_class="llama3_2" \
219+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
220+ base.params="${DOWNLOADED_PATH}/params.json" \
221+ model.use_sdpa_with_kv_cache=true \
222+ backend.xnnpack.enabled=true \
223+ backend.xnnpack.extended_ops=true \
224+ base.preq_mode="8da4w_output_8da8w" \
225+ base.preq_group_size=32 \
226+ export.max_seq_length=2048 \
227+ export.max_context_length=2048 \
228+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
229+ model.use_kv_cache=true \
230+ model.dtype_override=fp32 \
231+ base.preq_embedding_quantize=\'8,0\' \
232+ quantization.use_spin_quant=native \
233+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
234234 ls -lh "${OUT_ET_MODEL_NAME}.pte"
235235 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
236236 # QAT + LoRA
@@ -241,53 +241,55 @@ jobs:
241241 --files "tokenizer.model" "params.json" "consolidated.00.pth"
242242 )
243243 # Export using ExecuTorch's model definition
244- python -m examples.models.llama.export_llama \
245- --model "llama3_2" \
246- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
247- --params "${DOWNLOADED_PATH}/params.json" \
248- -qat \
249- -lora 16 \
250- --preq_mode 8da4w_output_8da8w \
251- --preq_group_size 32 \
252- --preq_embedding_quantize 8,0 \
253- --use_sdpa_with_kv_cache \
254- -kv \
255- -X \
256- --xnnpack-extended-ops \
257- -d fp32 \
258- --max_seq_length 2048 \
259- --max_context_length 2048 \
260- --output_name "${OUT_ET_MODEL_NAME}.pte" \
261- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
244+ python -m extension.llm.export.export_llm \
245+ base.model_class="llama3_2" \
246+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
247+ base.params="${DOWNLOADED_PATH}/params.json" \
248+ quantization.use_qat=true \
249+ base.use_lora=16 \
250+ base.preq_mode="8da4w_output_8da8w" \
251+ base.preq_group_size=32 \
252+ base.preq_embedding_quantize=\'8,0\' \
253+ model.use_sdpa_with_kv_cache=true \
254+ model.use_kv_cache=true \
255+ backend.xnnpack.enabled=true \
256+ backend.xnnpack.extended_ops=true \
257+ model.dtype_override=fp32 \
258+ export.max_seq_length=2048 \
259+ export.max_context_length=2048 \
260+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
261+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
262262 ls -lh "${OUT_ET_MODEL_NAME}.pte"
263263 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
264264 # Original BF16 version, without any quantization
265265 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
266- python -m examples.models.llama.export_llama \
267- --model "llama3_2" \
268- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
269- --params "${DOWNLOADED_PATH}/params.json" \
270- -kv \
271- --use_sdpa_with_kv_cache \
272- -X \
273- -d bf16 \
274- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
275- --output_name="${OUT_ET_MODEL_NAME}.pte"
266+ python -m extension.llm.export.export_llm \
267+ base.model_class="llama3_2" \
268+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
269+ base.params="${DOWNLOADED_PATH}/params.json" \
270+ model.use_kv_cache=true \
271+ model.use_sdpa_with_kv_cache=true \
272+ backend.xnnpack.enabled=true \
273+ model.dtype_override=bf16 \
274+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
275+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
276276 ls -lh "${OUT_ET_MODEL_NAME}.pte"
277277 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
278278 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
279- python -m examples.models.llama.export_llama \
280- --model llama3_2 \
281- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
282- --params "${DOWNLOADED_PATH}/params.json" \
283- -kv \
284- --use_sdpa_with_kv_cache \
285- -d fp32 \
286- -X \
287- --xnnpack-extended-ops \
288- -qmode 8da4w -G 32 -E 8,0 \
289- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
290- --output_name="${OUT_ET_MODEL_NAME}.pte"
279+ python -m extension.llm.export.export_llm \
280+ base.model_class=llama3_2 \
281+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
282+ base.params="${DOWNLOADED_PATH}/params.json" \
283+ model.use_kv_cache=true \
284+ model.use_sdpa_with_kv_cache=true \
285+ model.dtype_override=fp32 \
286+ backend.xnnpack.enabled=true \
287+ backend.xnnpack.extended_ops=true \
288+ quantization.qmode=8da4w \
289+ quantization.group_size=32 \
290+ quantization.embedding_quantize=\'8,0\' \
291+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
292+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
291293 ls -lh "${OUT_ET_MODEL_NAME}.pte"
292294 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
293295 export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
313315 elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
314316 if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
315317 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
316- python -m examples.models.llama.export_llama \
317- --model qwen3-0_6b \
318- --params examples/models/qwen3/0_6b_config.json \
319- -kv \
320- --use_sdpa_with_kv_cache \
321- -d fp32 \
322- -X \
323- --xnnpack-extended-ops \
324- -qmode 8da4w \
325- -G 32 \
326- -E 8,0 \
327- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
328- --output_name="${OUT_ET_MODEL_NAME}.pte"
318+ python -m extension.llm.export.export_llm \
319+ base.model_class=qwen3-0_6b \
320+ base.params=examples/models/qwen3/0_6b_config.json \
321+ model.use_kv_cache=true \
322+ model.use_sdpa_with_kv_cache=true \
323+ model.dtype_override=fp32 \
324+ backend.xnnpack.enabled=true \
325+ backend.xnnpack.extended_ops=true \
326+ quantization.qmode=8da4w \
327+ quantization.group_size=32 \
328+ quantization.embedding_quantize=\'8,0\' \
329+ base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
330+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
329331 ls -lh "${OUT_ET_MODEL_NAME}.pte"
330332 fi
331333 fi
0 commit comments