
Commit 8f9faa2

Update

[ghstack-poisoned]

1 parent a3daf98

File tree: 6 files changed, +44 −39 lines

.github/workflows/android-perf.yml

Lines changed: 9 additions & 9 deletions

@@ -228,9 +228,9 @@ jobs:
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
     model.use_kv_cache=true \
     model.dtype_override=fp32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     quantization.use_spin_quant=native \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
   # QAT + LoRA
@@ -249,7 +249,7 @@ jobs:
     base.use_lora=16 \
     base.preq_mode="8da4w_output_8da8w" \
     base.preq_group_size=32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     model.use_sdpa_with_kv_cache=true \
     model.use_kv_cache=true \
     backend.xnnpack.enabled=true \
@@ -258,7 +258,7 @@ jobs:
     export.max_seq_length=2048 \
     export.max_context_length=2048 \
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
   # Original BF16 version, without any quantization
@@ -271,7 +271,7 @@ jobs:
     model.use_sdpa_with_kv_cache=true \
     backend.xnnpack.enabled=true \
     model.dtype_override=bf16 \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -287,8 +287,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
@@ -325,8 +325,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 fi
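
Why the quoting change works: each override string is parsed twice, first by the shell and then by Hydra's override grammar. Inside double quotes, bash strips the backslash before each \" but keeps the ones before \{ and \} (a backslash is only special there before $, `, ", or \), so the old form leaked literal backslashes into the value Hydra received. A minimal shell sketch, using echo as a stand-in for the actual export step (illustrative only, not part of this workflow):

# Old form: \" becomes ", but \{ and \} keep their backslashes.
echo base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
# prints: base.metadata=\{"get_bos_id":128000,"get_eos_ids":[128009,128001]\}

# New form: single quotes pass the JSON through verbatim, and Hydra
# accepts the inner double quotes as-is.
echo base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
# prints: base.metadata={"get_bos_id":128000,"get_eos_ids":[128009,128001]}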

.github/workflows/apple-perf.yml

Lines changed: 9 additions & 9 deletions

@@ -237,9 +237,9 @@ jobs:
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
     model.use_kv_cache=true \
     model.dtype_override=fp32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     quantization.use_spin_quant=native \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
   # QAT + LoRA
@@ -258,7 +258,7 @@ jobs:
     base.use_lora=16 \
     base.preq_mode="8da4w_output_8da8w" \
     base.preq_group_size=32 \
-    base.preq_embedding_quantize="8,0" \
+    base.preq_embedding_quantize='8,0' \
     model.use_sdpa_with_kv_cache=true \
     model.use_kv_cache=true \
     backend.xnnpack.enabled=true \
@@ -267,7 +267,7 @@ jobs:
     export.max_seq_length=2048 \
     export.max_context_length=2048 \
     export.output_name="${OUT_ET_MODEL_NAME}.pte" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
   # Original BF16 version, without any quantization
@@ -280,7 +280,7 @@ jobs:
     model.use_sdpa_with_kv_cache=true \
     backend.xnnpack.enabled=true \
     model.dtype_override=bf16 \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -296,8 +296,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
@@ -330,8 +330,8 @@ jobs:
     backend.xnnpack.extended_ops=true \
     quantization.qmode=8da4w \
     quantization.group_size=32 \
-    quantization.embedding_quantize="8,0" \
-    base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+    quantization.embedding_quantize='8,0' \
+    base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
     export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 fi

examples/models/llama/config/llm_config.py

Lines changed: 9 additions & 4 deletions

@@ -10,6 +10,11 @@
 Configurations for exporting Llama.
 
 Uses dataclasses, which integrate with OmegaConf and Hydra.
+
+Note:
+    - Hydra is a bit finicky with string values that include quotation marks;
+      please refer to https://hydra.cc/docs/1.2/advanced/override_grammar/basic/#quoted-values
+      for more information.
 """
 
 import argparse
@@ -34,9 +39,9 @@ class ModelType(str, Enum):
     llama3_2_vision = "llama3_2_vision"
     static_llama = "static_llama"
     qwen2_5 = "qwen2_5"
-    qwen3_0_6b = "qwen3-0_6b"
-    qwen3_1_7b = "qwen3-1_7b"
-    qwen3_4b = "qwen3-4b"
+    qwen3_0_6b = "qwen3_0_6b"
+    qwen3_1_7b = "qwen3_1_7b"
+    qwen3_4b = "qwen3_4b"
     phi_4_mini = "phi_4_mini"
     smollm2 = "smollm2"
@@ -71,7 +76,7 @@ class BaseConfig:
         checkpoint_dir: Path to directory containing sharded checkpoint files.
         tokenizer_path: Path to the tokenizer file.
         metadata: Json string containing metadata information.
-            e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+            e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
         use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
         fairseq2: For legacy internal use cases, this is safe to ignore.
         preq_mode: Legacy option to specify how prequantized weights are loaded.
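
The new docstring note is the rationale for the commit; the linked Hydra page spells out the quoting rules. Two hedged shell examples of the pattern (assuming a generic Hydra app named my_app.py, which is not part of this repo):

# Simplest safe pattern: single-quote the override at the shell level so
# the inner double quotes reach Hydra's parser untouched.
python my_app.py base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'

# If the value itself must be a Hydra-quoted string (as in the updated
# docstring example above), add a quoted layer and escape the inner quotes:
python my_app.py base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'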

examples/models/llama/export_llama_lib.py

Lines changed: 6 additions & 6 deletions

@@ -104,9 +104,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
-    "qwen3-0_6b",
-    "qwen3-1_7b",
-    "qwen3-4b",
+    "qwen3_0_6b",
+    "qwen3_1_7b",
+    "qwen3_4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -115,9 +115,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
-    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
-    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
-    "qwen3-4b": "Qwen/Qwen3-4B",
+    "qwen3_0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3_1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3_4b": "Qwen/Qwen3-4B",
 }
 
 
examples/models/qwen3/README.md

Lines changed: 10 additions & 10 deletions

@@ -7,7 +7,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
 
 All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
 ```
-base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b]
 base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
 ```
 
@@ -17,7 +17,7 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
 Export 0.6b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-0_6b" \
+    base.model_class="qwen3_0_6b" \
     base.params="examples/models/qwen3/0_6b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -26,14 +26,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-0_6b.pte" \
+    export.output_name="qwen3_0_6b.pte" \
     debug.verbose=True
 ```
 
 Export 1.7b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-1_7b" \
+    base.model_class="qwen3_1_7b" \
     base.params="examples/models/qwen3/1_7b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -42,14 +42,14 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-1_7b.pte" \
+    export.output_name="qwen3_1_7b.pte" \
     debug.verbose=True
 ```
 
 Export 4b to XNNPack, quantized with 8da4w:
 ```
 python -m extension.llm.export.export_llm \
-    base.model_class="qwen3-4b" \
+    base.model_class="qwen3_4b" \
     base.params="examples/models/qwen3/4b_config.json" \
     model.use_kv_cache=True \
     model.use_sdpa_with_kv_cache=True \
@@ -58,16 +58,16 @@ python -m extension.llm.export.export_llm \
     backend.xnnpack.extended_ops=True \
     quantization.qmode="8da4w" \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-    export.output_name="qwen3-4b.pte" \
+    export.output_name="qwen3_4b.pte" \
     debug.verbose=True
 ```
 
 ### Example run
 With ExecuTorch pybindings:
 ```
 python -m examples.models.llama.runner.native
-    --model qwen3-0_6b \
-    --pte qwen3-0_6b.pte \
+    --model qwen3_0_6b \
+    --pte qwen3_0_6b.pte \
     --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
     --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \
     --prompt "Who is the president of the US?" \
@@ -80,7 +80,7 @@ python -m examples.models.llama.runner.native
 With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
 ```
 cmake-out/examples/models/llama/llama_main
-    --model_path qwen3-0_6b.pte
+    --model_path qwen3_0_6b.pte
     --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json
     --prompt="Who is the president of the US?"
 ```

extension/llm/export/README.md

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ debug:
 ### Export Qwen3 0.6B with XNNPACK backend and quantization
 ```bash
 python -m extension.llm.export.export_llm \
-    base.model_class=qwen3-0_6b \
+    base.model_class=qwen3_0_6b \
     base.params=examples/models/qwen3/0_6b_config.json \
     base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
     model.use_kv_cache=true \
