
Commit 8f9faa2

Update

[ghstack-poisoned]

1 parent a3daf98

File tree: 6 files changed, +44 −39 lines

.github/workflows/android-perf.yml

Lines changed: 9 additions & 9 deletions

@@ -228,9 +228,9 @@ jobs:
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
     model.use_kv_cache=true \
     model.dtype_override=fp32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     quantization.use_spin_quant=native \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
   # QAT + LoRA
@@ -249,7 +249,7 @@ jobs:
     base.use_lora=16 \
     base.preq_mode="8da4w_output_8da8w" \
     base.preq_group_size=32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     model.use_sdpa_with_kv_cache=true \
     model.use_kv_cache=true \
     backend.xnnpack.enabled=true \
@@ -258,7 +258,7 @@ jobs:
     export.max_seq_length=2048 \
     export.max_context_length=2048 \
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
   # Original BF16 version, without any quantization
@@ -271,7 +271,7 @@ jobs:
     model.use_sdpa_with_kv_cache=true \
     backend.xnnpack.enabled=true \
     model.dtype_override=bf16 \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -287,8 +287,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
@@ -325,8 +325,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 fi
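
Why the quoting change works: each override string is parsed twice, first by the shell and then by Hydra's override grammar. Inside double quotes, bash strips the backslash before each \" but keeps the ones before \{ and \} (a backslash is only special there before $, `, ", or \), so the old form leaked literal backslashes into the value Hydra received. A minimal shell sketch, using echo as a stand-in for the actual export step (illustrative only, not part of this workflow):

# Old form: \" becomes ", but \{ and \} keep their backslashes.
echo base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
# prints: base.metadata=\{"get_bos_id":128000,"get_eos_ids":[128009,128001]\}

# New form: single quotes pass the JSON through verbatim, and Hydra
# accepts the inner double quotes as-is.
echo base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
# prints: base.metadata={"get_bos_id":128000,"get_eos_ids":[128009,128001]}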

.github/workflows/apple-perf.yml

Lines changed: 9 additions & 9 deletions

@@ -237,9 +237,9 @@ jobs:
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
     model.use_kv_cache=true \
     model.dtype_override=fp32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     quantization.use_spin_quant=native \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
   # QAT + LoRA
@@ -258,7 +258,7 @@ jobs:
     base.use_lora=16 \
     base.preq_mode="8da4w_output_8da8w" \
     base.preq_group_size=32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     model.use_sdpa_with_kv_cache=true \
     model.use_kv_cache=true \
     backend.xnnpack.enabled=true \
@@ -267,7 +267,7 @@ jobs:
     export.max_seq_length=2048 \
     export.max_context_length=2048 \
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
   # Original BF16 version, without any quantization
@@ -280,7 +280,7 @@ jobs:
     model.use_sdpa_with_kv_cache=true \
     backend.xnnpack.enabled=true \
     model.dtype_override=bf16 \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -296,8 +296,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
@@ -330,8 +330,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 fi

examples/models/llama/config/llm_config.py

Lines changed: 9 additions & 4 deletions

@@ -10,6 +10,11 @@
 Configurations for exporting Llama.
 
 Uses dataclasses, which integrate with OmegaConf and Hydra.
+
+Note:
+    - Hydra is a bit finicky with string values that include quotation marks;
+      please refer to https://hydra.cc/docs/1.2/advanced/override_grammar/basic/#quoted-values
+      for more information.
 """
 
 import argparse
@@ -34,9 +39,9 @@ class ModelType(str, Enum):
     llama3_2_vision = "llama3_2_vision"
     static_llama = "static_llama"
     qwen2_5 = "qwen2_5"
-    qwen3_0_6b = "qwen3-0_6b"
-    qwen3_1_7b = "qwen3-1_7b"
-    qwen3_4b = "qwen3-4b"
+    qwen3_0_6b = "qwen3_0_6b"
+    qwen3_1_7b = "qwen3_1_7b"
+    qwen3_4b = "qwen3_4b"
     phi_4_mini = "phi_4_mini"
     smollm2 = "smollm2"
@@ -71,7 +76,7 @@ class BaseConfig:
         checkpoint_dir: Path to directory containing sharded checkpoint files.
         tokenizer_path: Path to the tokenizer file.
         metadata: Json string containing metadata information.
-            e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+            e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
         use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
         fairseq2: For legacy internal use cases, this is safe to ignore.
         preq_mode: Legacy option to specify how prequantized weights are loaded.
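
The new docstring note is the rationale for the commit; the linked Hydra page spells out the quoting rules. Two hedged shell examples of the pattern (assuming a generic Hydra app named my_app.py, which is not part of this repo):

# Simplest safe pattern: single-quote the override at the shell level so
# the inner double quotes reach Hydra's parser untouched.
python my_app.py base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'

# If the value itself must be a Hydra-quoted string (as in the updated
# docstring example above), add a quoted layer and escape the inner quotes:
python my_app.py base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'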

examples/models/llama/export_llama_lib.py

Lines changed: 6 additions & 6 deletions

@@ -104,9 +104,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
-    "qwen3-0_6b",
-    "qwen3-1_7b",
-    "qwen3-4b",
+    "qwen3_0_6b",
+    "qwen3_1_7b",
+    "qwen3_4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -115,9 +115,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
-    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
-    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
-    "qwen3-4b": "Qwen/Qwen3-4B",
+    "qwen3_0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3_1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3_4b": "Qwen/Qwen3-4B",
 }
 
 
examples/models/qwen3/README.md

Lines changed: 10 additions & 10 deletions

@@ -7,7 +7,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
 
 All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
 ```
-base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b]
 base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
 ```
 
@@ -17,7 +17,7 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
 Export 0.6b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-0_6b" \
+    base.model_class="qwen3_0_6b" \
     base.params="examples/models/qwen3/0_6b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -26,14 +26,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-0_6b.pte" \
+    export.output_name="qwen3_0_6b.pte" \
     debug.verbose=True
 ```
 
 Export 1.7b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-1_7b" \
+    base.model_class="qwen3_1_7b" \
     base.params="examples/models/qwen3/1_7b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -42,14 +42,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-1_7b.pte" \
+    export.output_name="qwen3_1_7b.pte" \
     debug.verbose=True
 ```
 
 Export 4b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-4b" \
+    base.model_class="qwen3_4b" \
     base.params="examples/models/qwen3/4b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -58,16 +58,16 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-4b.pte" \
+    export.output_name="qwen3_4b.pte" \
     debug.verbose=True
 ```
 
 ### Example run
 With ExecuTorch pybindings:
 ```
 python -m examples.models.llama.runner.native
-    --model qwen3-0_6b \
-    --pte qwen3-0_6b.pte \
+    --model qwen3_0_6b \
+    --pte qwen3_0_6b.pte \
     --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
     --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \
     --prompt "Who is the president of the US?" \
@@ -80,7 +80,7 @@ python -m examples.models.llama.runner.native
 With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
 ```
 cmake-out/examples/models/llama/llama_main
-    --model_path qwen3-0_6b.pte
+    --model_path qwen3_0_6b.pte
     --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json
     --prompt="Who is the president of the US?"
 ```

extension/llm/export/README.md

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ debug:
 ### Export Qwen3 0.6B with XNNPACK backend and quantization
 ```bash
 python -m extension.llm.export.export_llm \
-    base.model_class=qwen3-0_6b \
+    base.model_class=qwen3_0_6b \
     base.params=examples/models/qwen3/0_6b_config.json \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
     model.use_kv_cache=true \
