4 changes: 2 additions & 2 deletions .ci/scripts/test_model.sh
@@ -102,15 +102,15 @@ test_model() {
bash examples/models/llama/install_requirements.sh
# Test export_llm script: python3 -m extension.llm.export.export_llm.
# Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json
rm "./${MODEL_NAME}.pte"
return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
fi
if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
# Install requirements for export_llama
bash examples/models/llama/install_requirements.sh
# Test export_llm script: python3 -m extension.llm.export.export_llm.
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json
run_portable_executor_runner
rm "./${MODEL_NAME}.pte"
return
2 changes: 1 addition & 1 deletion .github/workflows/android-perf.yml
@@ -317,7 +317,7 @@ jobs:
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
python -m extension.llm.export.export_llm \
base.model_class=qwen3_0_6b \
base.params=examples/models/qwen3/0_6b_config.json \
base.params=examples/models/qwen3/config/0_6b_config.json \
model.use_kv_cache=true \
model.use_sdpa_with_kv_cache=true \
model.dtype_override=fp32 \
2 changes: 1 addition & 1 deletion .github/workflows/apple-perf.yml
@@ -322,7 +322,7 @@ jobs:
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
${CONDA_RUN} python -m extension.llm.export.export_llm \
base.model_class=qwen3_0_6b \
base.params=examples/models/qwen3/0_6b_config.json \
base.params=examples/models/qwen3/config/0_6b_config.json \
model.use_kv_cache=true \
model.use_sdpa_with_kv_cache=true \
model.dtype_override=fp32 \
15 changes: 4 additions & 11 deletions examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -53,17 +53,10 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
5. Generate a PTE file for use with the Llama runner.
```
python -m extension.llm.export.export_llm \
base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
base.params=params.json \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
backend.xnnpack.enabled=True \
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp16" \
base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
quantization.embedding_quantize=\'4,32\' \
export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
--config examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B.yaml \
+base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+base.params=params.json \
+export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
```
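
The `--config` file now carries the stable settings (dtype, backend, quantization), while run-specific paths are appended on the command line with a leading `+`. Assuming `export_llm` follows Hydra-style override rules (the `+` prefixes above suggest it does, but treat this as an assumption and check the export_llm documentation), a key that already exists in the YAML, such as `model.dtype_override`, can also be changed inline without the `+`. A minimal sketch:
```
# Sketch only: the dtype override is illustrative, not part of the recipe above.
python -m extension.llm.export.export_llm \
    --config examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B.yaml \
    +base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
    +base.params=params.json \
    +export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" \
    model.dtype_override="fp32"  # replaces the fp16 set in the YAML (assumed Hydra semantics)
```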

6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.
16 changes: 16 additions & 0 deletions examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B.yaml
@@ -0,0 +1,16 @@
base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: fp16

backend:
xnnpack:
enabled: True

quantization:
qmode: 8da4w
group_size: 128
embedding_quantize: 4,32
96 changes: 30 additions & 66 deletions examples/models/llama/README.md
@@ -168,14 +168,10 @@ LLAMA_CHECKPOINT=path/to/consolidated.00.pth
LLAMA_PARAMS=path/to/params.json

python -m extension.llm.export.export_llm \
base.model_class="llama3_2" \
base.checkpoint="${LLAMA_CHECKPOINT:?}" \
base.params="${LLAMA_PARAMS:?}" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="bf16" \
base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
export.output_name="llama3_2.pte"
--config examples/models/llama/config/llama_bf16.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).

@@ -190,22 +186,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/spinquant/params.json

python -m extension.llm.export.export_llm \
base.model_class="llama3_2" \
base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
base.params="${LLAMA_PARAMS:?}" \
model.use_sdpa_with_kv_cache=True \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
base.preq_mode="preq_8da4w_out_8da8w" \
base.preq_group_size=32 \
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="llama3_2.pte" \
model.use_kv_cache=True \
model.dtype_override="fp32" \
base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant="native" \
base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
--config examples/models/llama/config/llama_xnnpack_spinquant.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).

@@ -219,23 +203,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/qlora/params.json

python -m extension.llm.export.export_llm \
base.model_class="llama3_2" \
base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
base.params="${LLAMA_PARAMS:?}" \
quantization.use_qat=True \
base.use_lora=16 \
base.preq_mode="preq_8da4w_out_8da8w" \
base.preq_group_size=32 \
base.preq_embedding_quantize=\'8,0\' \
model.use_sdpa_with_kv_cache=True \
model.use_kv_cache=True \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
model.dtype_override="fp32" \
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="llama3_2.pte" \
base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
--config examples/models/llama/config/llama_xnnpack_qat.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).

@@ -246,20 +217,13 @@ You can export and run the original Llama 3 8B instruct model.
1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/).

2. Export model and generate `.pte` file
```
python -m extension.llm.export.export_llm \
base.checkpoint=<consolidated.00.pth.pth> \
base.params=<params.json> \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
backend.xnnpack.enabled=True \
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp32" \
base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
quantization.embedding_quantize=\'4,32\' \
export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
```
python -m extension.llm.export.export_llm \
--config examples/models/llama/config/llama_q8da4w.yaml \
+base.model_class="llama3" \
+base.checkpoint=<consolidated.00.pth.pth> \
+base.params=<params.json>
```
Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize: 4,32` (already set in `llama_q8da4w.yaml` above) to further reduce the model size.
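
If you want a different trade-off, one option (a sketch, not an official recipe) is to copy `examples/models/llama/config/llama_q8da4w.yaml` and edit its quantization block; the values below simply mirror the shipped config:
```
quantization:
  qmode: 8da4w              # 8-bit dynamic activations, 4-bit weights
  group_size: 128           # group size for the linear weight quantization
  embedding_quantize: 4,32  # 4-bit embedding weights, group size 32
```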


@@ -276,20 +240,20 @@ You can export and run the original Llama 3 8B instruct model.
Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the Common Issues and Mitigations section below for solutions.

2. Build llama runner.
```
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-DBUILD_TESTING=OFF \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out/examples/models/llama \
examples/models/llama
```
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-DBUILD_TESTING=OFF \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out/examples/models/llama \
examples/models/llama

cmake --build cmake-out/examples/models/llama -j16 --config Release
```
cmake --build cmake-out/examples/models/llama -j16 --config Release
```

3. Run the model. Run options are available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40).
```
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```
```
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```
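
For example, with a bf16 export named `llama3_2.pte` (set via `export.output_name`, as in the earlier recipes) and the tokenizer that ships alongside the Meta checkpoint, an invocation might look like the sketch below; both paths are placeholders for wherever your files actually live:
```
cmake-out/examples/models/llama/llama_main \
    --model_path=llama3_2.pte \
    --tokenizer_path=path/to/tokenizer.model \
    --prompt="Once upon a time,"
```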

To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`

7 changes: 7 additions & 0 deletions examples/models/llama/config/llama_bf16.yaml
@@ -0,0 +1,7 @@
base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: bf16
11 changes: 11 additions & 0 deletions examples/models/llama/config/llama_q8da4w.yaml
@@ -0,0 +1,11 @@
base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

model:
dtype_override: fp32

quantization:
qmode: 8da4w
group_size: 128
embedding_quantize: 4,32

23 changes: 23 additions & 0 deletions examples/models/llama/config/llama_xnnpack_qat.yaml
@@ -0,0 +1,23 @@
base:
preq_mode: preq_8da4w_out_8da8w
preq_group_size: 32
preq_embedding_quantize: 8,0
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
use_lora: 16

model:
use_sdpa_with_kv_cache: True
use_kv_cache: True
dtype_override: fp32

export:
max_seq_length: 2048
max_context_length: 2048

quantization:
use_qat: True

backend:
xnnpack:
enabled: True
extended_ops: True
22 changes: 22 additions & 0 deletions examples/models/llama/config/llama_xnnpack_spinquant.yaml
@@ -0,0 +1,22 @@
base:
preq_mode: preq_8da4w_out_8da8w
preq_group_size: 32
preq_embedding_quantize: 8,0
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

model:
use_sdpa_with_kv_cache: True
use_kv_cache: True
dtype_override: fp32

export:
max_seq_length: 2048
max_context_length: 2048

quantization:
use_spin_quant: native

backend:
xnnpack:
enabled: True
extended_ops: True
16 changes: 5 additions & 11 deletions examples/models/phi_4_mini/README.md
@@ -8,7 +8,7 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para
All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args:
```
base.model_class="phi_4_mini"
base.params="examples/models/phi-4-mini/config.json"
base.params="examples/models/phi_4_mini/config/config.json"
base.checkpoint=<path-to-meta-checkpoint>
```

@@ -33,16 +33,10 @@ Export to XNNPack, no quantization:
PHI_CHECKPOINT=path/to/checkpoint.pth

python -m extension.llm.export.export_llm \
base.model_class="phi_4_mini" \
base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \
base.params="examples/models/phi-4-mini/config.json" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="phi-4-mini.pte" \
debug.verbose=True
--config examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml \
+base.checkpoint="${PHI_CHECKPOINT:?}" \
+base.params="examples/models/phi_4_mini/config/config.json" \
+export.output_name="phi-4-mini.pte"
```

Run using the executor runner:
12 changes: 12 additions & 0 deletions examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml
@@ -0,0 +1,12 @@
base:
model_class: phi_4_mini
metadata: '{"get_bos_id":151643, "get_eos_ids":[151643]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: fp32

backend:
xnnpack:
enabled: True
17 changes: 6 additions & 11 deletions examples/models/qwen2_5/README.md
@@ -8,7 +8,7 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params
All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args:
```
base.model_class="qwen2_5"
base.params="examples/models/qwen2_5/1_5b_config.json"
base.params="examples/models/qwen2_5/config/1_5b_config.json"
base.checkpoint=<path-to-meta-checkpoint>
```

@@ -33,16 +33,11 @@ Export to XNNPack, no quantization:
QWEN_CHECKPOINT=path/to/checkpoint.pth

python -m extension.llm.export.export_llm \
base.model_class="qwen2_5" \
base.checkpoint="${QWEN_CHECKPOINT:?}" \
base.params="examples/models/qwen2_5/1_5b_config.json" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="qwen2_5-1_5b.pte" \
debug.verbose=True
--config examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml \
+base.model_class="qwen2_5" \
+base.checkpoint="${QWEN_CHECKPOINT:?}" \
+base.params="examples/models/qwen2_5/config/1_5b_config.json" \
+export.output_name="qwen2_5-1_5b.pte"
```

Run using the executor runner:
11 changes: 11 additions & 0 deletions examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml
@@ -0,0 +1,11 @@
base:
metadata: '{"get_bos_id":151643, "get_eos_ids":[151643]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: fp32

backend:
xnnpack:
enabled: True