diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index bbf879295ae..bc9bbb8bae0 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -102,7 +102,7 @@ test_model() { bash examples/models/llama/install_requirements.sh # Test export_llm script: python3 -m extension.llm.export.export_llm. # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration. - "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json rm "./${MODEL_NAME}.pte" return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears. fi @@ -110,7 +110,7 @@ test_model() { # Install requirements for export_llama bash examples/models/llama/install_requirements.sh # Test export_llm script: python3 -m extension.llm.export.export_llm. - "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" return diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index a79a900b2d8..2eab69eb88b 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -317,7 +317,7 @@ jobs: DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") python -m extension.llm.export.export_llm \ base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/0_6b_config.json \ + base.params=examples/models/qwen3/config/0_6b_config.json \ model.use_kv_cache=true \ model.use_sdpa_with_kv_cache=true \ model.dtype_override=fp32 \ diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 6b1666da642..3db5abbefbd 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -322,7 +322,7 @@ jobs: DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") ${CONDA_RUN} python -m extension.llm.export.export_llm \ base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/0_6b_config.json \ + base.params=examples/models/qwen3/config/0_6b_config.json \ model.use_kv_cache=true \ model.use_sdpa_with_kv_cache=true \ model.dtype_override=fp32 \ diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index f05dd9990a2..00397e9f60f 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -53,17 +53,10 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") 5. Generate a PTE file for use with the Llama runner. 
 ```
 python -m extension.llm.export.export_llm \
-    base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
-    base.params=params.json \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    backend.xnnpack.enabled=True \
-    quantization.qmode="8da4w" \
-    quantization.group_size=128 \
-    model.dtype_override="fp16" \
-    base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-    quantization.embedding_quantize=\'4,32\' \
-    export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+    --config examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml \
+    +base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+    +base.params=params.json \
+    +export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
 ```
 
 6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.
diff --git a/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml b/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml
new file mode 100644
index 00000000000..1da7c253d92
--- /dev/null
+++ b/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml
@@ -0,0 +1,16 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: fp16
+
+backend:
+  xnnpack:
+    enabled: True
+
+quantization:
+  qmode: 8da4w
+  group_size: 128
+  embedding_quantize: 4,32
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 3e6869e5c49..bbd2107ad74 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -168,14 +168,10 @@ LLAMA_CHECKPOINT=path/to/consolidated.00.pth
 LLAMA_PARAMS=path/to/params.json
 
 python -m extension.llm.export.export_llm \
-    base.model_class="llama3_2" \
-    base.checkpoint="${LLAMA_CHECKPOINT:?}" \
-    base.params="${LLAMA_PARAMS:?}" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="bf16" \
-    base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-    export.output_name="llama3_2.pte"
+    --config examples/models/llama/config/llama_bf16.yaml \
+    +base.model_class="llama3_2" \
+    +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+    +base.params="${LLAMA_PARAMS:?}"
 ```
 
 For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -190,22 +186,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth
 LLAMA_PARAMS=path/to/spinquant/params.json
 
 python -m extension.llm.export.export_llm \
-    base.model_class="llama3_2" \
-    base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
-    base.params="${LLAMA_PARAMS:?}" \
-    model.use_sdpa_with_kv_cache=True \
-    backend.xnnpack.enabled=True \
-    backend.xnnpack.extended_ops=True \
-    base.preq_mode="preq_8da4w_out_8da8w" \
-    base.preq_group_size=32 \
-    export.max_seq_length=2048 \
-    export.max_context_length=2048 \
-    export.output_name="llama3_2.pte" \
-    model.use_kv_cache=True \
-    model.dtype_override="fp32" \
-    base.preq_embedding_quantize=\'8,0\' \
-    quantization.use_spin_quant="native" \
-    base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
+    --config examples/models/llama/config/llama_xnnpack_spinquant.yaml \
+    +base.model_class="llama3_2" \
+    +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+    +base.params="${LLAMA_PARAMS:?}"
 ```
 
 For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -219,23 +203,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth
 LLAMA_PARAMS=path/to/qlora/params.json
 
 python -m extension.llm.export.export_llm \
-    base.model_class="llama3_2" \
-    base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
-    base.params="${LLAMA_PARAMS:?}" \
-    quantization.use_qat=True \
-    base.use_lora=16 \
-    base.preq_mode="preq_8da4w_out_8da8w" \
-    base.preq_group_size=32 \
-    base.preq_embedding_quantize=\'8,0\' \
-    model.use_sdpa_with_kv_cache=True \
-    model.use_kv_cache=True \
-    backend.xnnpack.enabled=True \
-    backend.xnnpack.extended_ops=True \
-    model.dtype_override="fp32" \
-    export.max_seq_length=2048 \
-    export.max_context_length=2048 \
-    export.output_name="llama3_2.pte" \
-    base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
+    --config examples/models/llama/config/llama_xnnpack_qat.yaml \
+    +base.model_class="llama3_2" \
+    +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+    +base.params="${LLAMA_PARAMS:?}"
 ```
 
 For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -246,20 +217,13 @@ You can export and run the original Llama 3 8B instruct model.
 
 1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/).
 
 2. Export model and generate `.pte` file
-    ```
-    python -m extension.llm.export.export_llm \
-    base.checkpoint= \
-    base.params= \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    backend.xnnpack.enabled=True \
-    quantization.qmode="8da4w" \
-    quantization.group_size=128 \
-    model.dtype_override="fp32" \
-    base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-    quantization.embedding_quantize=\'4,32\' \
-    export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
-    ```
+```
+python -m extension.llm.export.export_llm \
+    --config examples/models/llama/config/llama_q8da4w.yaml \
+    +base.model_class="llama3" \
+    +base.checkpoint= \
+    +base.params=
+```
 
 Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size.
@@ -276,20 +240,20 @@ You can export and run the original Llama 3 8B instruct model.
 
     Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section of Common Issues and Mitigations below for solutions.
 
 2. Build llama runner.
-    ```
-    cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DBUILD_TESTING=OFF \
-        -DCMAKE_BUILD_TYPE=Release \
-        -Bcmake-out/examples/models/llama \
-        examples/models/llama
+```
+cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DBUILD_TESTING=OFF \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
 
-    cmake --build cmake-out/examples/models/llama -j16 --config Release
-    ```
+cmake --build cmake-out/examples/models/llama -j16 --config Release
+```
 
 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40).
-    ```
-    cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt=
-    ```
+```
+cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt=
+```
 
 To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`
diff --git a/examples/models/llama/config/llama_bf16.yaml b/examples/models/llama/config/llama_bf16.yaml
new file mode 100644
index 00000000000..8e89e8aa437
--- /dev/null
+++ b/examples/models/llama/config/llama_bf16.yaml
@@ -0,0 +1,7 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: bf16
\ No newline at end of file
diff --git a/examples/models/llama/config/llama_q8da4w.yaml b/examples/models/llama/config/llama_q8da4w.yaml
new file mode 100644
index 00000000000..476ae928c60
--- /dev/null
+++ b/examples/models/llama/config/llama_q8da4w.yaml
@@ -0,0 +1,11 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  dtype_override: fp32
+
+quantization:
+  qmode: 8da4w
+  group_size: 128
+  embedding_quantize: 4,32
+
\ No newline at end of file
diff --git a/examples/models/llama/config/llama_xnnpack_qat.yaml b/examples/models/llama/config/llama_xnnpack_qat.yaml
new file mode 100644
index 00000000000..2369ff1d279
--- /dev/null
+++ b/examples/models/llama/config/llama_xnnpack_qat.yaml
@@ -0,0 +1,23 @@
+base:
+  preq_mode: preq_8da4w_out_8da8w
+  preq_group_size: 32
+  preq_embedding_quantize: 8,0
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+  use_lora: 16
+
+model:
+  use_sdpa_with_kv_cache: True
+  use_kv_cache: True
+  dtype_override: fp32
+
+export:
+  max_seq_length: 2048
+  max_context_length: 2048
+
+quantization:
+  use_qat: True
+
+backend:
+  xnnpack:
+    enabled: True
+    extended_ops: True
\ No newline at end of file
diff --git a/examples/models/llama/config/llama_xnnpack_spinquant.yaml b/examples/models/llama/config/llama_xnnpack_spinquant.yaml
new file mode 100644
index 00000000000..441086d6f73
--- /dev/null
+++ b/examples/models/llama/config/llama_xnnpack_spinquant.yaml
@@ -0,0 +1,22 @@
+base:
+  preq_mode: preq_8da4w_out_8da8w
+  preq_group_size: 32
+  preq_embedding_quantize: 8,0
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_sdpa_with_kv_cache: True
+  use_kv_cache: True
+  dtype_override: fp32
+
+export:
+  max_seq_length: 2048
+  max_context_length: 2048
+
+quantization:
+  use_spin_quant: native
+
+backend:
+  xnnpack:
+    enabled: True
+    extended_ops: True
\ No newline at end of file
diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md
index d168d54226e..8fb2f03ac4c 100644
--- a/examples/models/phi_4_mini/README.md
+++ b/examples/models/phi_4_mini/README.md
@@ -8,7 +8,7 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para
 All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args:
 ```
 base.model_class="phi_4_mini"
-base.params="examples/models/phi-4-mini/config.json"
+base.params="examples/models/phi_4_mini/config/config.json"
 base.checkpoint=
 ```
 
@@ -33,16 +33,10 @@ Export to XNNPack, no quantization:
 PHI_CHECKPOINT=path/to/checkpoint.pth
 
 python -m extension.llm.export.export_llm \
-    base.model_class="phi_4_mini" \
-    base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \
-    base.params="examples/models/phi-4-mini/config.json" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="fp32" \
-    backend.xnnpack.enabled=True \
-    base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
-    export.output_name="phi-4-mini.pte" \
-    debug.verbose=True
+    --config examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml \
+    +base.checkpoint="${PHI_CHECKPOINT:?}" \
+    +base.params="examples/models/phi_4_mini/config/config.json" \
+    +export.output_name="phi-4-mini.pte"
 ```
 
 Run using the executor runner:
diff --git a/examples/models/phi_4_mini/config.json b/examples/models/phi_4_mini/config/config.json
similarity index 100%
rename from examples/models/phi_4_mini/config.json
rename to examples/models/phi_4_mini/config/config.json
diff --git a/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml b/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml
new file mode 100644
index 00000000000..9355bd99f64
--- /dev/null
+++ b/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml
@@ -0,0 +1,12 @@
+base:
+  model_class: phi_4_mini
+  metadata: '{"get_bos_id":151643, "get_eos_ids":[151643]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: fp32
+
+backend:
+  xnnpack:
+    enabled: True
\ No newline at end of file
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
index 57784169ece..566a7a5c30b 100644
--- a/examples/models/qwen2_5/README.md
+++ b/examples/models/qwen2_5/README.md
@@ -8,7 +8,7 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params
 All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args:
 ```
 base.model_class="qwen2_5"
-base.params="examples/models/qwen2_5/1_5b_config.json"
+base.params="examples/models/qwen2_5/config/1_5b_config.json"
 base.checkpoint=
 ```
 
@@ -33,16 +33,11 @@ Export to XNNPack, no quantization:
 QWEN_CHECKPOINT=path/to/checkpoint.pth
 
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen2_5" \
-    base.checkpoint="${QWEN_CHECKPOINT:?}" \
-    base.params="examples/models/qwen2_5/1_5b_config.json" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="fp32" \
-    backend.xnnpack.enabled=True \
-    base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
-    export.output_name="qwen2_5-1_5b.pte" \
-    debug.verbose=True
+    --config examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml \
+    +base.model_class="qwen2_5" \
+    +base.checkpoint="${QWEN_CHECKPOINT:?}" \
+    +base.params="examples/models/qwen2_5/config/1_5b_config.json" \
+    +export.output_name="qwen2_5-1_5b.pte"
 ```
 
 Run using the executor runner:
diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/config/1_5b_config.json
similarity index 100%
rename from examples/models/qwen2_5/1_5b_config.json
rename to examples/models/qwen2_5/config/1_5b_config.json
diff --git a/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml b/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml
new file mode 100644
index 00000000000..0e5c6f7624e
--- /dev/null
+++ b/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml
@@ -0,0 +1,11 @@
+base:
+  metadata: '{"get_bos_id":151643, "get_eos_ids":[151643]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: fp32
+
+backend:
+  xnnpack:
+    enabled: True
\ No newline at end of file
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index e24d8da2637..d2d89db93c2 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -8,7 +8,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
 All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
 ```
 base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b]
-base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
+base.params=[examples/models/qwen3/config/0_6b_config.json,examples/models/qwen3/config/1_7b_config.json,examples/models/qwen3/config/4b_config.json]
 ```
 
 ### Example export
@@ -17,49 +17,29 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
 Export 0.6b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3_0_6b" \
-    base.params="examples/models/qwen3/0_6b_config.json" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="fp32" \
-    backend.xnnpack.enabled=True \
-    backend.xnnpack.extended_ops=True \
-    quantization.qmode="8da4w" \
-    base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3_0_6b.pte" \
-    debug.verbose=True
+    --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml \
+    +base.model_class="qwen3_0_6b" \
+    +base.params="examples/models/qwen3/config/0_6b_config.json" \
+    +export.output_name="qwen3_0_6b.pte"
+
 ```
 
 Export 1.7b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3_1_7b" \
-    base.params="examples/models/qwen3/1_7b_config.json" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="fp32" \
-    backend.xnnpack.enabled=True \
-    backend.xnnpack.extended_ops=True \
-    quantization.qmode="8da4w" \
-    base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3_1_7b.pte" \
-    debug.verbose=True
+    --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml \
+    +base.model_class="qwen3_1_7b" \
+    +base.params="examples/models/qwen3/config/1_7b_config.json" \
+    +export.output_name="qwen3_1_7b.pte"
 ```
 
 Export 4b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3_4b" \
-    base.params="examples/models/qwen3/4b_config.json" \
-    model.use_kv_cache=True \
-    model.use_sdpa_with_kv_cache=True \
-    model.dtype_override="fp32" \
-    backend.xnnpack.enabled=True \
-    backend.xnnpack.extended_ops=True \
-    quantization.qmode="8da4w" \
-    base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
-    export.output_name="qwen3_4b.pte" \
-    debug.verbose=True
+    --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml \
+    +base.model_class="qwen3_4b" \
+    +base.params="examples/models/qwen3/config/4b_config.json" \
+    +export.output_name="qwen3_4b.pte"
 ```
 
 ### Example run
diff --git a/examples/models/qwen3/0_6b_config.json b/examples/models/qwen3/config/0_6b_config.json
similarity index 100%
rename from examples/models/qwen3/0_6b_config.json
rename to examples/models/qwen3/config/0_6b_config.json
diff --git a/examples/models/qwen3/1_7b_config.json b/examples/models/qwen3/config/1_7b_config.json
similarity index 100%
rename from examples/models/qwen3/1_7b_config.json
rename to examples/models/qwen3/config/1_7b_config.json
diff --git a/examples/models/qwen3/4b_config.json b/examples/models/qwen3/config/4b_config.json
similarity index 100%
rename from examples/models/qwen3/4b_config.json
rename to examples/models/qwen3/config/4b_config.json
diff --git a/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml b/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml
new file mode 100644
index 00000000000..60292b1ecdc
--- /dev/null
+++ b/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml
@@ -0,0 +1,15 @@
+base:
+  metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: fp32
+
+quantization:
+  qmode: 8da4w
+
+backend:
+  xnnpack:
+    enabled: True
+    extended_ops: True
\ No newline at end of file
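
The YAML files added in this patch all draw from the same `export_llm` option tree, with per-model values (checkpoint, params, output name) layered on top from the command line as `+section.key=value` arguments, as the README snippets above do. Below is a consolidated sketch of the sections and keys used across these configs; the keys are taken from the files in this patch, the values are purely illustrative, and each model's config only sets the subset it actually needs.

```yaml
# Consolidated sketch of the option groups used by the configs in this patch.
# Values are illustrative; each model's YAML sets only the keys it needs, and
# per-model values are supplied on the CLI as +section.key=value, as the
# README examples above show.
base:
  model_class: llama3_2                 # usually passed on the CLI instead
  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
  preq_mode: preq_8da4w_out_8da8w       # pre-quantized checkpoints only (SpinQuant / QAT+LoRA)
  preq_group_size: 32
  preq_embedding_quantize: 8,0
  use_lora: 16                          # QAT+LoRA checkpoints only

model:
  use_kv_cache: True
  use_sdpa_with_kv_cache: True
  dtype_override: fp32                  # bf16 / fp16 / fp32

export:
  max_seq_length: 2048
  max_context_length: 2048
  output_name: llama3_2.pte             # usually passed on the CLI instead

quantization:
  qmode: 8da4w
  group_size: 128
  embedding_quantize: 4,32
  use_qat: True                         # QAT+LoRA recipe only
  use_spin_quant: native                # SpinQuant recipe only

backend:
  xnnpack:
    enabled: True
    extended_ops: True
```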