
Commit e6a94ef

Use export_llm in CI

ghstack-source-id: bd75cc2
ghstack-comment-id: 2993075817
Pull-Request: #11836

Parent: 6cff4ec

17 files changed: +512, -177 lines

.ci/configs/README.md (new file, 42 lines)

# CI Configuration Files for LLM Export

This directory contains YAML configuration files used by CI tests for exporting LLM models with the new `extension.llm.export.export_llm` command.

## Usage

These config files can be used with the export command like this:

```bash
python -m extension.llm.export.export_llm --config path/to/config.yaml
```

Or you can override specific parameters:

```bash
python -m extension.llm.export.export_llm --config ci_stories110m_xnnpack_quantized.yaml base.checkpoint=my_checkpoint.pt
```

## Configuration Files

### CI Test Configurations

- `ci_stories110m_xnnpack_quantized.yaml` - Stories110M with XNNPACK quantization (used in test_llama.sh)
- `ci_stories110m_mps.yaml` - Stories110M with MPS backend
- `ci_stories110m_coreml.yaml` - Stories110M with CoreML backend
- `ci_stories110m_qnn.yaml` - Stories110M with QNN backend

### Performance Test Configurations

- `llama3_spinquant.yaml` - Llama3 with SpinQuant (used in apple-perf.yml and android-perf.yml)
- `llama3_qlora.yaml` - Llama3 with QLoRA (QAT + LoRA)
- `llama3_coreml_ane.yaml` - Llama3 with CoreML ANE
- `xnnpack_8da4w_basic.yaml` - Basic XNNPACK 8da4w quantization
- `qwen3_xnnpack_8da4w.yaml` - Qwen3 with XNNPACK 8da4w quantization

### Specialized Configurations

- `stories110m_torchao_lowbit.yaml` - Stories110M with TorchAO lowbit quantization
- `xnnpack_custom_quantized.yaml` - XNNPACK with custom ops and quantization

## Background

These configuration files were created as part of migrating CI tests from the old `examples.models.llama.export_llama` command to the new `extension.llm.export.export_llm` command with hydra configuration support.

The config files reduce duplication in CI scripts and make it easier to maintain consistent export settings across different test scenarios.
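Several settings can also be combined in a single invocation (an illustrative sketch using the same hydra dotted-key syntax shown above; the checkpoint path and override values are placeholders):

```bash
python -m extension.llm.export.export_llm \
  --config ci_stories110m_xnnpack_quantized.yaml \
  base.checkpoint=my_checkpoint.pt \
  export.max_seq_length=256 \
  debug.verbose=true
```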
.ci/configs/ci_stories110m_coreml.yaml (new file, 20 lines)

```yaml
# Configuration for CI test_llama.sh - stories110M with CoreML backend

base:
  model_class: "stories110m"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 128
  max_context_length: 128

backend:
  coreml:
    enabled: true

debug:
  verbose: true
```
.ci/configs/ci_stories110m_mps.yaml (new file, 20 lines)

```yaml
# Configuration for CI test_llama.sh - stories110M with MPS backend

base:
  model_class: "stories110m"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 128
  max_context_length: 128

backend:
  mps:
    enabled: true

debug:
  verbose: true
```
.ci/configs/ci_stories110m_qnn.yaml (new file, 28 lines)

```yaml
# Configuration for CI test_llama.sh - stories110M with QNN backend

base:
  model_class: "stories110m"
  tokenizer_path: "tokenizer.model"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 128
  max_context_length: 128

quantization:
  pt2e_quantize: "qnn_16a16w"
  calibration_tasks: ["wikitext"]
  calibration_limit: 1
  calibration_seq_length: 128
  calibration_data: "Once"

backend:
  qnn:
    enabled: true

debug:
  verbose: true
```
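If a CI script needs to vary the calibration settings without editing the file, list- and scalar-valued keys can be overridden on the command line as well (a sketch assuming standard hydra override syntax; the override values are illustrative):

```bash
python -m extension.llm.export.export_llm \
  --config .ci/configs/ci_stories110m_qnn.yaml \
  quantization.calibration_limit=2 \
  quantization.calibration_seq_length=64 \
  'quantization.calibration_tasks=[wikitext]'
```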
.ci/configs/ci_stories110m_xnnpack_quantized.yaml (new file, 27 lines)

```yaml
# Configuration for CI test_llama.sh - stories110M with XNNPACK quantization
# Used when XNNPACK=ON, CUSTOM=ON, QE=ON modes are enabled

base:
  model_class: "stories110m"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  use_sdpa_with_kv_cache: true

export:
  max_seq_length: 128
  max_context_length: 128

quantization:
  qmode: "8da4w"
  group_size: 128
  embedding_quantize: "8,1024"

backend:
  xnnpack:
    enabled: true
    extended_ops: true

debug:
  verbose: false
```
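For reference, this config corresponds to passing the same settings as dotted overrides without a config file (a sketch based on the override syntax shown in the README; the actual CI invocation may differ):

```bash
python -m extension.llm.export.export_llm \
  base.model_class=stories110m \
  model.dtype_override=fp32 \
  model.use_kv_cache=true \
  model.use_sdpa_with_kv_cache=true \
  export.max_seq_length=128 \
  export.max_context_length=128 \
  quantization.qmode=8da4w \
  quantization.group_size=128 \
  'quantization.embedding_quantize="8,1024"' \
  backend.xnnpack.enabled=true \
  backend.xnnpack.extended_ops=true
```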

.ci/configs/llama3_coreml_ane.yaml (new file, 27 lines)

```yaml
# Configuration for Llama3 with CoreML ANE
# Used in apple-perf.yml

base:
  model_class: "llama3_2"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 128
  max_context_length: 128

quantization:
  embedding_quantize: "4,32"

backend:
  coreml:
    enabled: true
    ios: 18
    quantize: "c4w"
    compute_units: "cpu_and_ne"

debug:
  verbose: false
```

.ci/configs/llama3_qlora.yaml (new file, 30 lines)

```yaml
# Configuration for Llama3 with QLoRA (QAT + LoRA)
# Used in apple-perf.yml and android-perf.yml

base:
  model_class: "llama3_2"
  use_lora: 16
  preq_mode: "8da4w_output_8da8w"
  preq_group_size: 32
  preq_embedding_quantize: "8,0"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  use_sdpa_with_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 2048
  max_context_length: 2048

quantization:
  use_qat: true

backend:
  xnnpack:
    enabled: true
    extended_ops: true

debug:
  verbose: false
```

.ci/configs/llama3_spinquant.yaml (new file, 29 lines)

```yaml
# Configuration for Llama3 with SpinQuant
# Used in apple-perf.yml and android-perf.yml

base:
  model_class: "llama3_2"
  preq_mode: "8da4w_output_8da8w"
  preq_group_size: 32
  preq_embedding_quantize: "8,0"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  use_sdpa_with_kv_cache: true
  enable_dynamic_shape: false

export:
  max_seq_length: 2048
  max_context_length: 2048

quantization:
  use_spin_quant: "native"

backend:
  xnnpack:
    enabled: true
    extended_ops: true

debug:
  verbose: false
```
.ci/configs/qwen3_xnnpack_8da4w.yaml (new file, 28 lines)

```yaml
# Configuration for Qwen3-0.6B with XNNPACK 8da4w quantization
# Used in apple-perf.yml and android-perf.yml

base:
  model_class: "qwen3-0_6b"
  params: "examples/models/qwen3/0_6b_config.json"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  use_sdpa_with_kv_cache: true

export:
  max_seq_length: 128
  max_context_length: 128

quantization:
  qmode: "8da4w"
  group_size: 32
  embedding_quantize: "8,0"

backend:
  xnnpack:
    enabled: true
    extended_ops: true

debug:
  verbose: false
```
.ci/configs/stories110m_torchao_lowbit.yaml (new file, 26 lines)

```yaml
# Configuration for stories110M with TorchAO lowbit quantization
# Used in CI test_llama_torchao_lowbit.sh

base:
  model_class: "stories110m"

model:
  dtype_override: "fp32"
  use_kv_cache: true
  use_sdpa_with_kv_cache: true

export:
  max_seq_length: 128
  max_context_length: 128

quantization:
  qmode: "torchao:8da3w"      # QLINEAR_BITWIDTH=3
  group_size: 128             # QLINEAR_GROUP_SIZE=128
  embedding_quantize: "4,32"  # QEMBEDDING_BITWIDTH=4, QEMBEDDING_GROUP_SIZE=32

backend:
  xnnpack:
    enabled: false

debug:
  verbose: false
```
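The inline comments mirror the shell variables the old script passed explicitly. If the bitwidths ever need to vary again, a script could keep this config and override just those keys (a hypothetical sketch; the variable names follow the comments above and the override syntax from the README):

```bash
QLINEAR_BITWIDTH=3
QLINEAR_GROUP_SIZE=128
python -m extension.llm.export.export_llm \
  --config .ci/configs/stories110m_torchao_lowbit.yaml \
  "quantization.qmode=torchao:8da${QLINEAR_BITWIDTH}w" \
  "quantization.group_size=${QLINEAR_GROUP_SIZE}"
```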
