
Commit 0da5cb4

jackzhxng and hinriksnaer authored and committed
Create yaml configs for documented export_llm use cases (pytorch#11938)
Migrate READMEs to use yaml llm configurations for export_llm
1 parent eadff1e commit 0da5cb4

21 files changed, +180 -137 lines changed
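
The migration keeps the `export_llm` entry point unchanged but moves the long lists of CLI overrides into checked-in yaml configs: shared settings are loaded with `--config`, and run-specific values are appended as Hydra-style `+key=value` additions. A minimal before/after sketch for the bf16 Llama export, using the `llama_bf16.yaml` config added in this commit (the "before" command is abbreviated; the `base.metadata` flag is omitted for brevity):

```
# Before: every export option passed as an individual CLI override.
python -m extension.llm.export.export_llm \
  base.model_class="llama3_2" \
  base.checkpoint="${LLAMA_CHECKPOINT:?}" \
  base.params="${LLAMA_PARAMS:?}" \
  model.use_kv_cache=True \
  model.use_sdpa_with_kv_cache=True \
  model.dtype_override="bf16" \
  export.output_name="llama3_2.pte"

# After: shared settings come from the checked-in yaml config; only
# per-run values are layered on top as +key=value additions.
python -m extension.llm.export.export_llm \
  --config examples/models/llama/config/llama_bf16.yaml \
  +base.model_class="llama3_2" \
  +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
  +base.params="${LLAMA_PARAMS:?}"
```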

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions
@@ -102,15 +102,15 @@ test_model() {
     bash examples/models/llama/install_requirements.sh
     # Test export_llm script: python3 -m extension.llm.export.export_llm.
     # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
     # Test export_llm script: python3 -m extension.llm.export.export_llm.
-    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
     return

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -317,7 +317,7 @@ jobs:
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
           python -m extension.llm.export.export_llm \
             base.model_class=qwen3_0_6b \
-            base.params=examples/models/qwen3/0_6b_config.json \
+            base.params=examples/models/qwen3/config/0_6b_config.json \
             model.use_kv_cache=true \
             model.use_sdpa_with_kv_cache=true \
             model.dtype_override=fp32 \

.github/workflows/apple-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -322,7 +322,7 @@ jobs:
           DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
           ${CONDA_RUN} python -m extension.llm.export.export_llm \
             base.model_class=qwen3_0_6b \
-            base.params=examples/models/qwen3/0_6b_config.json \
+            base.params=examples/models/qwen3/config/0_6b_config.json \
             model.use_kv_cache=true \
             model.use_sdpa_with_kv_cache=true \
             model.dtype_override=fp32 \

examples/models/deepseek-r1-distill-llama-8B/README.md

Lines changed: 4 additions & 11 deletions
@@ -53,17 +53,10 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
 5. Generate a PTE file for use with the Llama runner.
 ```
 python -m extension.llm.export.export_llm \
-  base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
-  base.params=params.json \
-  model.use_kv_cache=True \
-  model.use_sdpa_with_kv_cache=True \
-  backend.xnnpack.enabled=True \
-  quantization.qmode="8da4w" \
-  quantization.group_size=128 \
-  model.dtype_override="fp16" \
-  base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-  quantization.embedding_quantize=\'4,32\' \
-  export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+  --config examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B.yaml \
+  +base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+  +base.params=params.json \
+  +export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
 ```

 6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.

examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B.yaml

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: fp16
+
+backend:
+  xnnpack:
+    enabled: True
+
+quantization:
+  qmode: 8da4w
+  group_size: 128
+  embedding_quantize: 4,32

examples/models/llama/README.md

Lines changed: 30 additions & 66 deletions
@@ -168,14 +168,10 @@ LLAMA_CHECKPOINT=path/to/consolidated.00.pth
 LLAMA_PARAMS=path/to/params.json

 python -m extension.llm.export.export_llm \
-  base.model_class="llama3_2" \
-  base.checkpoint="${LLAMA_CHECKPOINT:?}" \
-  base.params="${LLAMA_PARAMS:?}" \
-  model.use_kv_cache=True \
-  model.use_sdpa_with_kv_cache=True \
-  model.dtype_override="bf16" \
-  base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-  export.output_name="llama3_2.pte"
+  --config examples/models/llama/config/llama_bf16.yaml \
+  +base.model_class="llama3_2" \
+  +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+  +base.params="${LLAMA_PARAMS:?}"
 ```
 For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).

@@ -190,22 +186,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth
 LLAMA_PARAMS=path/to/spinquant/params.json

 python -m extension.llm.export.export_llm \
-  base.model_class="llama3_2" \
-  base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
-  base.params="${LLAMA_PARAMS:?}" \
-  model.use_sdpa_with_kv_cache=True \
-  backend.xnnpack.enabled=True \
-  backend.xnnpack.extended_ops=True \
-  base.preq_mode="preq_8da4w_out_8da8w" \
-  base.preq_group_size=32 \
-  export.max_seq_length=2048 \
-  export.max_context_length=2048 \
-  export.output_name="llama3_2.pte" \
-  model.use_kv_cache=True \
-  model.dtype_override="fp32" \
-  base.preq_embedding_quantize=\'8,0\' \
-  quantization.use_spin_quant="native" \
-  base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
+  --config examples/models/llama/config/llama_xnnpack_spinquant.yaml \
+  +base.model_class="llama3_2" \
+  +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+  +base.params="${LLAMA_PARAMS:?}"
 ```
 For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).

@@ -219,23 +203,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth
 LLAMA_PARAMS=path/to/qlora/params.json

 python -m extension.llm.export.export_llm \
-  base.model_class="llama3_2" \
-  base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
-  base.params="${LLAMA_PARAMS:?}" \
-  quantization.use_qat=True \
-  base.use_lora=16 \
-  base.preq_mode="preq_8da4w_out_8da8w" \
-  base.preq_group_size=32 \
-  base.preq_embedding_quantize=\'8,0\' \
-  model.use_sdpa_with_kv_cache=True \
-  model.use_kv_cache=True \
-  backend.xnnpack.enabled=True \
-  backend.xnnpack.extended_ops=True \
-  model.dtype_override="fp32" \
-  export.max_seq_length=2048 \
-  export.max_context_length=2048 \
-  export.output_name="llama3_2.pte" \
-  base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
+  --config examples/models/llama/config/llama_xnnpack_qat.yaml \
+  +base.model_class="llama3_2" \
+  +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+  +base.params="${LLAMA_PARAMS:?}"
 ```
 For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).

@@ -246,20 +217,13 @@ You can export and run the original Llama 3 8B instruct model.
 1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/).

 2. Export model and generate `.pte` file
-    ```
-    python -m extension.llm.export.export_llm \
-      base.checkpoint=<consolidated.00.pth.pth> \
-      base.params=<params.json> \
-      model.use_kv_cache=True \
-      model.use_sdpa_with_kv_cache=True \
-      backend.xnnpack.enabled=True \
-      quantization.qmode="8da4w" \
-      quantization.group_size=128 \
-      model.dtype_override="fp32" \
-      base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
-      quantization.embedding_quantize=\'4,32\' \
-      export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
-    ```
+    ```
+    python -m extension.llm.export.export_llm \
+      --config examples/models/llama/config/llama_q8da4w.yaml \
+      +base.model_class="llama3" \
+      +base.checkpoint=<consolidated.00.pth.pth> \
+      +base.params=<params.json>
+    ```
 Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size.


@@ -276,20 +240,20 @@ You can export and run the original Llama 3 8B instruct model.
 Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section of Common Issues and Mitigations below for solutions.

 2. Build llama runner.
-    ```
-    cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DBUILD_TESTING=OFF \
-        -DCMAKE_BUILD_TYPE=Release \
-        -Bcmake-out/examples/models/llama \
-        examples/models/llama
+    ```
+    cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+        -DBUILD_TESTING=OFF \
+        -DCMAKE_BUILD_TYPE=Release \
+        -Bcmake-out/examples/models/llama \
+        examples/models/llama

-    cmake --build cmake-out/examples/models/llama -j16 --config Release
-    ```
+    cmake --build cmake-out/examples/models/llama -j16 --config Release
+    ```

 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40).
-    ```
-    cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
-    ```
+    ```
+    cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
+    ```

 To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`

examples/models/llama/config/llama_bf16.yaml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: bf16

examples/models/llama/config/llama_q8da4w.yaml

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  dtype_override: fp32
+
+quantization:
+  qmode: 8da4w
+  group_size: 128
+  embedding_quantize: 4,32
+

examples/models/llama/config/llama_xnnpack_qat.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+base:
+  preq_mode: preq_8da4w_out_8da8w
+  preq_group_size: 32
+  preq_embedding_quantize: 8,0
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+  use_lora: 16
+
+model:
+  use_sdpa_with_kv_cache: True
+  use_kv_cache: True
+  dtype_override: fp32
+
+export:
+  max_seq_length: 2048
+  max_context_length: 2048
+
+quantization:
+  use_qat: True
+
+backend:
+  xnnpack:
+    enabled: True
+    extended_ops: True

examples/models/llama/config/llama_xnnpack_spinquant.yaml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+base:
+  preq_mode: preq_8da4w_out_8da8w
+  preq_group_size: 32
+  preq_embedding_quantize: 8,0
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_sdpa_with_kv_cache: True
+  use_kv_cache: True
+  dtype_override: fp32
+
+export:
+  max_seq_length: 2048
+  max_context_length: 2048
+
+quantization:
+  use_spin_quant: native
+
+backend:
+  xnnpack:
+    enabled: True
+    extended_ops: True
