From c6362fb23503a887119219cfbe962aa989796bc7 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Tue, 24 Jun 2025 11:57:41 -0700 Subject: [PATCH 1/7] Allow CLI overrides --- extension/llm/export/export_llm.py | 43 ++++++---- extension/llm/export/test/test_export_llm.py | 90 ++++++++++---------- 2 files changed, 74 insertions(+), 59 deletions(-) diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py index e995b329f30..7abe7bf3e91 100644 --- a/extension/llm/export/export_llm.py +++ b/extension/llm/export/export_llm.py @@ -38,14 +38,19 @@ from executorch.examples.models.llama.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import export_llama from hydra.core.config_store import ConfigStore +from hydra.core.hydra_config import HydraConfig from omegaconf import OmegaConf cs = ConfigStore.instance() cs.store(name="llm_config", node=LlmConfig) +# Need this global variable to pass an llm_config from yaml +# into the hydra-wrapped main function. +llm_config_from_yaml = None + + def parse_config_arg() -> Tuple[str, List[Any]]: - """First parse out the arg for whether to use Hydra or the old CLI.""" parser = argparse.ArgumentParser(add_help=True) parser.add_argument("--config", type=str, help="Path to the LlmConfig file") args, remaining = parser.parse_known_args() @@ -65,28 +70,34 @@ def pop_config_arg() -> str: @hydra.main(version_base=None, config_name="llm_config") def hydra_main(llm_config: LlmConfig) -> None: - export_llama(OmegaConf.to_object(llm_config)) + global llm_config_from_yaml + + # Override the LlmConfig constructed from the provide yaml config file + # with the CLI overrides. + if llm_config_from_yaml: + # Get CLI overrides (excluding defaults list). + overrides_list: List[str] = list(HydraConfig.get().overrides.get("task", [])) + override_cfg = OmegaConf.from_dotlist(overrides_list) + merged_config = OmegaConf.merge(llm_config_from_yaml, override_cfg) + export_llama(merged_config) + else: + export_llama(OmegaConf.to_object(llm_config)) def main() -> None: + # First parse out the arg for whether to use Hydra or the old CLI. config, remaining_args = parse_config_arg() if config: - # Check if there are any remaining hydra CLI args when --config is specified - # This might change in the future to allow overriding config file values - if remaining_args: - raise ValueError( - "Cannot specify additional CLI arguments when using --config. " - f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both." - ) - + global llm_config_from_yaml + # Pop out --config and its value so that they are not parsed by + # Hyra's main. config_file_path = pop_config_arg() default_llm_config = LlmConfig() - llm_config_from_file = OmegaConf.load(config_file_path) - # Override defaults with values specified in the .yaml provided by --config. - merged_llm_config = OmegaConf.merge(default_llm_config, llm_config_from_file) - export_llama(merged_llm_config) - else: - hydra_main() + # Construct the LlmConfig from the config yaml file. 
+ default_llm_config = LlmConfig() + from_yaml = OmegaConf.load(config_file_path) + llm_config_from_yaml = OmegaConf.merge(default_llm_config, from_yaml) + hydra_main() if __name__ == "__main__": diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py index 7d17b7819d3..7ae98f97c5b 100644 --- a/extension/llm/export/test/test_export_llm.py +++ b/extension/llm/export/test/test_export_llm.py @@ -21,7 +21,7 @@ class TestExportLlm(unittest.TestCase): def test_parse_config_arg_with_config(self) -> None: """Test parse_config_arg when --config is provided.""" # Mock sys.argv to include --config - test_argv = ["script.py", "--config", "test_config.yaml", "extra", "args"] + test_argv = ["export_llm.py", "--config", "test_config.yaml", "extra", "args"] with patch.object(sys, "argv", test_argv): config_path, remaining = parse_config_arg() self.assertEqual(config_path, "test_config.yaml") @@ -29,7 +29,7 @@ def test_parse_config_arg_with_config(self) -> None: def test_parse_config_arg_without_config(self) -> None: """Test parse_config_arg when --config is not provided.""" - test_argv = ["script.py", "debug.verbose=True"] + test_argv = ["export_llm.py", "debug.verbose=True"] with patch.object(sys, "argv", test_argv): config_path, remaining = parse_config_arg() self.assertIsNone(config_path) @@ -37,11 +37,21 @@ def test_parse_config_arg_without_config(self) -> None: def test_pop_config_arg(self) -> None: """Test pop_config_arg removes --config and its value from sys.argv.""" - test_argv = ["script.py", "--config", "test_config.yaml", "other", "args"] + test_argv = ["export_llm.py", "--config", "test_config.yaml", "other", "args"] with patch.object(sys, "argv", test_argv): config_path = pop_config_arg() self.assertEqual(config_path, "test_config.yaml") - self.assertEqual(sys.argv, ["script.py", "other", "args"]) + self.assertEqual(sys.argv, ["export_llm.py", "other", "args"]) + + def test_with_cli_args(self) -> None: + """Test main function with only hydra CLI args.""" + test_argv = ["export_llm.py", "debug.verbose=True"] + with patch.object(sys, "argv", test_argv): + with patch( + "executorch.extension.llm.export.export_llm.hydra_main" + ) as mock_hydra: + main() + mock_hydra.assert_called_once() @patch("executorch.extension.llm.export.export_llm.export_llama") def test_with_config(self, mock_export_llama: MagicMock) -> None: @@ -70,7 +80,7 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None: config_file = f.name try: - test_argv = ["script.py", "--config", config_file] + test_argv = ["export_llm.py", "--config", config_file] with patch.object(sys, "argv", test_argv): main() @@ -99,54 +109,48 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None: finally: os.unlink(config_file) - def test_with_cli_args(self) -> None: - """Test main function with only hydra CLI args.""" - test_argv = ["script.py", "debug.verbose=True"] - with patch.object(sys, "argv", test_argv): - with patch( - "executorch.extension.llm.export.export_llm.hydra_main" - ) as mock_hydra: - main() - mock_hydra.assert_called_once() - - def test_config_with_cli_args_error(self) -> None: - """Test that --config rejects additional CLI arguments to prevent mixing approaches.""" + @patch("executorch.extension.llm.export.export_llm.export_llama") + def test_with_config_and_cli(self, mock_export_llama: MagicMock) -> None: + """Test main function with --config file and no hydra args.""" # Create a temporary config file with tempfile.NamedTemporaryFile(mode="w", 
suffix=".yaml", delete=False) as f: - f.write("base:\n checkpoint: /path/to/checkpoint.pth") - config_file = f.name - - try: - test_argv = ["script.py", "--config", config_file, "debug.verbose=True"] - with patch.object(sys, "argv", test_argv): - with self.assertRaises(ValueError) as cm: - main() - - error_msg = str(cm.exception) - self.assertIn( - "Cannot specify additional CLI arguments when using --config", - error_msg, - ) - finally: - os.unlink(config_file) - - def test_config_rejects_multiple_cli_args(self) -> None: - """Test that --config rejects multiple CLI arguments (not just single ones).""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - f.write("export:\n max_seq_length: 128") + f.write( + """ +base: + model_class: llama2 +model: + dtype_override: fp16 +backend: + xnnpack: + enabled: False +""" + ) config_file = f.name try: test_argv = [ - "script.py", + "export_llm.py", "--config", config_file, - "debug.verbose=True", - "export.output_dir=/tmp", + "base.model_class=stories110m", + "backend.xnnpack.enabled=True", ] with patch.object(sys, "argv", test_argv): - with self.assertRaises(ValueError): - main() + main() + + # Verify export_llama was called with config + mock_export_llama.assert_called_once() + called_config = mock_export_llama.call_args[0][0] + self.assertEqual( + called_config["base"]["model_class"], "stories110m" + ) # Override from CLI. + self.assertEqual( + called_config["model"]["dtype_override"].value, "fp16" + ) # From yaml. + self.assertEqual( + called_config["backend"]["xnnpack"]["enabled"], + True, # Override from CLI. + ) finally: os.unlink(config_file) From 26078ae89e92a8575c18ff1edd07da8a772697f5 Mon Sep 17 00:00:00 2001 From: "Jack Zhang (aider)" <32371937+jackzhxng@users.noreply.github.com> Date: Tue, 24 Jun 2025 17:23:01 -0700 Subject: [PATCH 2/7] Try splitting config into path and name --- extension/llm/export/export_llm.py | 52 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py index 7abe7bf3e91..73ce9fc0ad7 100644 --- a/extension/llm/export/export_llm.py +++ b/extension/llm/export/export_llm.py @@ -30,6 +30,7 @@ """ import argparse +import os import sys from typing import Any, List, Tuple @@ -45,11 +46,6 @@ cs.store(name="llm_config", node=LlmConfig) -# Need this global variable to pass an llm_config from yaml -# into the hydra-wrapped main function. -llm_config_from_yaml = None - - def parse_config_arg() -> Tuple[str, List[Any]]: parser = argparse.ArgumentParser(add_help=True) parser.add_argument("--config", type=str, help="Path to the LlmConfig file") @@ -61,6 +57,7 @@ def pop_config_arg() -> str: """ Removes '--config' and its value from sys.argv. Assumes --config is specified and argparse has already validated the args. + Returns the config file path. """ idx = sys.argv.index("--config") value = sys.argv[idx + 1] @@ -68,20 +65,28 @@ def pop_config_arg() -> str: return value -@hydra.main(version_base=None, config_name="llm_config") -def hydra_main(llm_config: LlmConfig) -> None: - global llm_config_from_yaml +def add_hydra_config_args(config_file_path: str) -> None: + """ + Breaks down the config file path into directory and filename, + resolves the directory to an absolute path, and adds the + --config_path and --config_name arguments to sys.argv. 
+ """ + config_dir = os.path.dirname(config_file_path) + config_name = os.path.basename(config_file_path) + + # Resolve to absolute path + config_dir_abs = os.path.abspath(config_dir) + + # Add the hydra config arguments to sys.argv + sys.argv.extend(["--config-path", config_dir_abs, "--config-name", config_name]) + - # Override the LlmConfig constructed from the provide yaml config file - # with the CLI overrides. - if llm_config_from_yaml: - # Get CLI overrides (excluding defaults list). - overrides_list: List[str] = list(HydraConfig.get().overrides.get("task", [])) - override_cfg = OmegaConf.from_dotlist(overrides_list) - merged_config = OmegaConf.merge(llm_config_from_yaml, override_cfg) - export_llama(merged_config) - else: - export_llama(OmegaConf.to_object(llm_config)) +@hydra.main(version_base=None, config_name="llm_config", config_path=None) +def hydra_main(llm_config: LlmConfig) -> None: + structured = OmegaConf.structured(LlmConfig) + merged = OmegaConf.merge(structured, llm_config) + llm_config_obj = OmegaConf.to_object(merged) + export_llama(llm_config_obj) def main() -> None: @@ -90,13 +95,12 @@ def main() -> None: if config: global llm_config_from_yaml # Pop out --config and its value so that they are not parsed by - # Hyra's main. + # Hydra's main. config_file_path = pop_config_arg() - default_llm_config = LlmConfig() - # Construct the LlmConfig from the config yaml file. - default_llm_config = LlmConfig() - from_yaml = OmegaConf.load(config_file_path) - llm_config_from_yaml = OmegaConf.merge(default_llm_config, from_yaml) + + # Add hydra config_path and config_name arguments to sys.argv. + add_hydra_config_args(config_file_path) + hydra_main() From 4dfd43fe4d716a2c68f866cad6aa7083eef73adb Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:21:06 -0700 Subject: [PATCH 3/7] Update README --- extension/llm/export/README.md | 60 +++++++----------------------- extension/llm/export/export_llm.py | 8 ++-- 2 files changed, 17 insertions(+), 51 deletions(-) diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md index 96f36acc1b4..e97b9e10462 100644 --- a/extension/llm/export/README.md +++ b/extension/llm/export/README.md @@ -23,9 +23,9 @@ The LLM export process transforms a model from its original format to an optimiz ## Usage -The export API supports two configuration approaches: +The export API supports a Hydra-style CLI where you can you configure using yaml and also CLI args. -### Option 1: Hydra CLI Arguments +### Hydra CLI Arguments Use structured configuration arguments directly on the command line: @@ -41,7 +41,7 @@ python -m extension.llm.export.export_llm \ quantization.qmode=8da4w ``` -### Option 2: Configuration File +### Configuration File Create a YAML configuration file and reference it: @@ -78,53 +78,21 @@ debug: verbose: true ``` -**Important**: You cannot mix both approaches. Use either CLI arguments OR a config file, not both. 
+You can you also still provide additional overrides using the CLI args as well: -## Example Commands - -### Export Qwen3 0.6B with XNNPACK backend and quantization ```bash -python -m extension.llm.export.export_llm \ - base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/0_6b_config.json \ - base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - model.dtype_override=FP32 \ - export.max_seq_length=512 \ - export.output_name=qwen3_0_6b.pte \ - quantization.qmode=8da4w \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - debug.verbose=true +python -m extension.llm.export.export_llm + --config my_config.yaml + base.model_class="llama2" + +export.max_context_length=1024 ``` -### Export Phi-4-Mini with custom checkpoint -```bash -python -m extension.llm.export.export_llm \ - base.model_class=phi_4_mini \ - base.checkpoint=/path/to/phi4_checkpoint.pth \ - base.params=examples/models/phi-4-mini/config.json \ - base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - export.max_seq_length=256 \ - export.output_name=phi4_mini.pte \ - backend.xnnpack.enabled=true \ - debug.verbose=true -``` +Note that if a config file is specified and you want to specify a CLI arg that is not in the config, you need to prepend with a `+`. You can read more about this in the Hydra [docs](https://hydra.cc/docs/advanced/override_grammar/basic/). -### Export with CoreML backend (iOS optimization) -```bash -python -m extension.llm.export.export_llm \ - base.model_class=llama3 \ - model.use_kv_cache=true \ - export.max_seq_length=128 \ - backend.coreml.enabled=true \ - backend.coreml.compute_units=ALL \ - quantization.pt2e_quantize=coreml_c4w \ - debug.verbose=true -``` + +## Example Commands + +Please refer to the docs for some of our example suported models ([Llama](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md), [Qwen3](https://github.com/pytorch/executorch/tree/main/examples/models/qwen3/README.md), [Phi-4-mini](https://github.com/pytorch/executorch/tree/main/examples/models/phi_4_mini/README.md)). 
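The override precedence described above (built-in `LlmConfig` defaults, then the values in the `--config` yaml, then any `key=value` or `+key=value` CLI overrides) can be reproduced with plain OmegaConf. The sketch below is only an illustration of the merge order, not the export code itself, and it uses a small stand-in `Config`/`Export` schema rather than the real `LlmConfig`:

```python
from dataclasses import dataclass, field
from omegaconf import OmegaConf

@dataclass
class Export:
    max_seq_length: int = 128
    output_name: str = "model.pte"

@dataclass
class Config:  # stand-in for the real LlmConfig schema
    export: Export = field(default_factory=Export)

defaults = OmegaConf.structured(Config)                             # schema + defaults
from_yaml = OmegaConf.create({"export": {"max_seq_length": 512}})   # as if OmegaConf.load("my_config.yaml")
cli = OmegaConf.from_dotlist(["export.output_name=llama3_2.pte"])   # hydra-style key=value overrides

merged = OmegaConf.merge(defaults, from_yaml, cli)                  # later arguments win
print(merged.export.max_seq_length, merged.export.output_name)     # -> 512 llama3_2.pte
```

This mirrors what the new `hydra_main` relies on when it merges `OmegaConf.structured(LlmConfig)` with the config Hydra composes from the yaml and the command-line overrides: later sources take precedence, which is why a CLI override always wins over the same key in the config file.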
## Configuration Options @@ -134,4 +102,4 @@ For a complete reference of all available configuration options, see the [LlmCon - [Llama Examples](../../../examples/models/llama/README.md) - Comprehensive Llama export guide - [LLM Runner](../runner/) - Running exported models -- [ExecuTorch Documentation](https://pytorch.org/executorch/) - Framework overview \ No newline at end of file +- [ExecuTorch Documentation](https://pytorch.org/executorch/) - Framework overview diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py index 73ce9fc0ad7..e0467250a28 100644 --- a/extension/llm/export/export_llm.py +++ b/extension/llm/export/export_llm.py @@ -39,7 +39,6 @@ from executorch.examples.models.llama.config.llm_config import LlmConfig from executorch.examples.models.llama.export_llama_lib import export_llama from hydra.core.config_store import ConfigStore -from hydra.core.hydra_config import HydraConfig from omegaconf import OmegaConf cs = ConfigStore.instance() @@ -73,10 +72,10 @@ def add_hydra_config_args(config_file_path: str) -> None: """ config_dir = os.path.dirname(config_file_path) config_name = os.path.basename(config_file_path) - + # Resolve to absolute path config_dir_abs = os.path.abspath(config_dir) - + # Add the hydra config arguments to sys.argv sys.argv.extend(["--config-path", config_dir_abs, "--config-name", config_name]) @@ -93,11 +92,10 @@ def main() -> None: # First parse out the arg for whether to use Hydra or the old CLI. config, remaining_args = parse_config_arg() if config: - global llm_config_from_yaml # Pop out --config and its value so that they are not parsed by # Hydra's main. config_file_path = pop_config_arg() - + # Add hydra config_path and config_name arguments to sys.argv. add_hydra_config_args(config_file_path) From 06164b62a172ee738946842282bec8d22a526f67 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 25 Jun 2025 16:03:26 -0700 Subject: [PATCH 4/7] Fix test --- extension/llm/export/test/test_export_llm.py | 28 +++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py index 7ae98f97c5b..e6f7160d4af 100644 --- a/extension/llm/export/test/test_export_llm.py +++ b/extension/llm/export/test/test_export_llm.py @@ -88,23 +88,19 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None: mock_export_llama.assert_called_once() called_config = mock_export_llama.call_args[0][0] self.assertEqual( - called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json" + called_config.base.tokenizer_path, "/path/to/tokenizer.json" ) - self.assertEqual(called_config["base"]["model_class"], "llama2") - self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w") - self.assertEqual(called_config["model"]["dtype_override"].value, "fp16") - self.assertEqual(called_config["export"]["max_seq_length"], 256) + self.assertEqual(called_config.base.model_class, "llama2") + self.assertEqual(called_config.base.preq_mode.value, "8da4w") + self.assertEqual(called_config.model.dtype_override.value, "fp16") + self.assertEqual(called_config.export.max_seq_length, 256) self.assertEqual( - called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic" + called_config.quantization.pt2e_quantize.value, "xnnpack_dynamic" ) + self.assertEqual(called_config.quantization.use_spin_quant.value, "cuda") + self.assertEqual(called_config.backend.coreml.quantize.value, "c4w") 
self.assertEqual( - called_config["quantization"]["use_spin_quant"].value, "cuda" - ) - self.assertEqual( - called_config["backend"]["coreml"]["quantize"].value, "c4w" - ) - self.assertEqual( - called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu" + called_config.backend.coreml.compute_units.value, "cpu_and_gpu" ) finally: os.unlink(config_file) @@ -142,13 +138,13 @@ def test_with_config_and_cli(self, mock_export_llama: MagicMock) -> None: mock_export_llama.assert_called_once() called_config = mock_export_llama.call_args[0][0] self.assertEqual( - called_config["base"]["model_class"], "stories110m" + called_config.base.model_class, "stories110m" ) # Override from CLI. self.assertEqual( - called_config["model"]["dtype_override"].value, "fp16" + called_config.model.dtype_override.value, "fp16" ) # From yaml. self.assertEqual( - called_config["backend"]["xnnpack"]["enabled"], + called_config.backend.xnnpack.enabled, True, # Override from CLI. ) finally: From 5f5c1319a4aca64ef27f64340a30723ee5ecc646 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Tue, 24 Jun 2025 15:29:00 -0700 Subject: [PATCH 5/7] Create LlmConfig yamls for documented export_llm use cases --- .ci/scripts/test_model.sh | 4 +- .github/workflows/android-perf.yml | 2 +- .github/workflows/apple-perf.yml | 2 +- .../deepseek-r1-distill-llama-8B/README.md | 9 +- .../config/deepseek_xnnpack_q8da4w.yaml | 16 ++++ examples/models/llama/README.md | 90 ++++++------------- examples/models/llama/config/llama_bf16.yaml | 7 ++ .../models/llama/config/llama_q8da4w.yaml | 11 +++ .../llama/config/llama_xnnpack_qat.yaml | 23 +++++ .../llama/config/llama_xnnpack_spinquant.yaml | 22 +++++ examples/models/phi_4_mini/README.md | 12 +-- .../phi_4_mini/{ => config}/config.json | 0 .../phi_4_mini/config/phi_4_mini_xnnpack.yaml | 12 +++ examples/models/qwen2_5/README.md | 9 +- .../qwen2_5/{ => config}/1_5b_config.json | 0 .../config/qwen2_5_xnnpack_q8da4w.yaml | 11 +++ examples/models/qwen3/README.md | 36 ++------ .../qwen3/{ => config}/0_6b_config.json | 0 .../qwen3/{ => config}/1_7b_config.json | 0 .../models/qwen3/{ => config}/4b_config.json | 0 .../qwen3/config/qwen3_xnnpack_q8da4w.yaml | 15 ++++ 21 files changed, 162 insertions(+), 119 deletions(-) create mode 100644 examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml create mode 100644 examples/models/llama/config/llama_bf16.yaml create mode 100644 examples/models/llama/config/llama_q8da4w.yaml create mode 100644 examples/models/llama/config/llama_xnnpack_qat.yaml create mode 100644 examples/models/llama/config/llama_xnnpack_spinquant.yaml rename examples/models/phi_4_mini/{ => config}/config.json (100%) create mode 100644 examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml rename examples/models/qwen2_5/{ => config}/1_5b_config.json (100%) create mode 100644 examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml rename examples/models/qwen3/{ => config}/0_6b_config.json (100%) rename examples/models/qwen3/{ => config}/1_7b_config.json (100%) rename examples/models/qwen3/{ => config}/4b_config.json (100%) create mode 100644 examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index bbf879295ae..bc9bbb8bae0 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -102,7 +102,7 @@ test_model() { bash examples/models/llama/install_requirements.sh # Test export_llm script: python3 -m extension.llm.export.export_llm. 
# Use Llama random checkpoint with Qwen 2.5 1.5b model configuration. - "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json rm "./${MODEL_NAME}.pte" return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears. fi @@ -110,7 +110,7 @@ test_model() { # Install requirements for export_llama bash examples/models/llama/install_requirements.sh # Test export_llm script: python3 -m extension.llm.export.export_llm. - "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json + "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" return diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index a7c2b9ca14c..1b1f0f68a1a 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -317,7 +317,7 @@ jobs: DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") python -m extension.llm.export.export_llm \ base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/0_6b_config.json \ + base.params=examples/models/qwen3/config/0_6b_config.json \ model.use_kv_cache=true \ model.use_sdpa_with_kv_cache=true \ model.dtype_override=fp32 \ diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 6b1666da642..3db5abbefbd 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -322,7 +322,7 @@ jobs: DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") ${CONDA_RUN} python -m extension.llm.export.export_llm \ base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/0_6b_config.json \ + base.params=examples/models/qwen3/config/0_6b_config.json \ model.use_kv_cache=true \ model.use_sdpa_with_kv_cache=true \ model.dtype_override=fp32 \ diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index f05dd9990a2..311ee52eb16 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -53,16 +53,9 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") 5. Generate a PTE file for use with the Llama runner. 
``` python -m extension.llm.export.export_llm \ + --config examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ base.params=params.json \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - backend.xnnpack.enabled=True \ - quantization.qmode="8da4w" \ - quantization.group_size=128 \ - model.dtype_override="fp16" \ - base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ - quantization.embedding_quantize=\'4,32\' \ export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" ``` diff --git a/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml b/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml new file mode 100644 index 00000000000..1da7c253d92 --- /dev/null +++ b/examples/models/deepseek-r1-distill-llama-8B/config/deepseek_xnnpack_q8da4w.yaml @@ -0,0 +1,16 @@ +base: + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: fp16 + +backend: + xnnpack: + enabled: True + +quantization: + qmode: 8da4w + group_size: 128 + embedding_quantize: 4,32 diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index e555043c44d..fb4927177ee 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -168,14 +168,10 @@ LLAMA_CHECKPOINT=path/to/consolidated.00.pth LLAMA_PARAMS=path/to/params.json python -m extension.llm.export.export_llm \ + --config examples/models/llamaconfig/llama_bf16.yaml base.model_class="llama3_2" \ base.checkpoint="${LLAMA_CHECKPOINT:?}" \ base.params="${LLAMA_PARAMS:?}" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="bf16" \ - base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ - export.output_name="llama3_2.pte" ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -190,22 +186,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - base.params="${LLAMA_PARAMS:?}" \ - model.use_sdpa_with_kv_cache=True \ - backend.xnnpack.enabled=True \ - backend.xnnpack.extended_ops=True \ - base.preq_mode="8da4w_output_8da8w" \ - base.preq_group_size=32 \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="llama3_2.pte" \ - model.use_kv_cache=True \ - model.dtype_override="fp32" \ - base.preq_embedding_quantize=\'8,0\' \ - quantization.use_spin_quant="native" \ - base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' + --config examples/models/llama/config/llama_xnnpack_spinquant.yaml + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. 
The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -219,23 +203,10 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth LLAMA_PARAMS=path/to/qlora/params.json python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - base.params="${LLAMA_PARAMS:?}" \ - quantization.use_qat=True \ - base.use_lora=16 \ - base.preq_mode="8da4w_output_8da8w" \ - base.preq_group_size=32 \ - base.preq_embedding_quantize=\'8,0\' \ - model.use_sdpa_with_kv_cache=True \ - model.use_kv_cache=True \ - backend.xnnpack.enabled=True \ - backend.xnnpack.extended_ops=True \ - model.dtype_override="fp32" \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="llama3_2.pte" \ - base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' + --config examples/models/llama/config/llama_xnnpack_qat.yaml + base.model_class="llama3_2" \ + base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + base.params="${LLAMA_PARAMS:?}" \ ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -246,20 +217,13 @@ You can export and run the original Llama 3 8B instruct model. 1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). 2. Export model and generate `.pte` file - ``` - python -m extension.llm.export.export_llm \ - base.checkpoint= \ - base.params= \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - backend.xnnpack.enabled=True \ - quantization.qmode="8da4w" \ - quantization.group_size=128 \ - model.dtype_override="fp32" \ - base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \ - quantization.embedding_quantize=\'4,32\' \ - export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" - ``` +``` +python -m extension.llm.export.export_llm \ + --config examples/models/llama/config/llama_q8da4w.yaml + base.model_clas="llama3" + base.checkpoint= \ + base.params= \ +``` Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size. @@ -276,20 +240,20 @@ You can export and run the original Llama 3 8B instruct model. Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section of Common Issues and Mitigations below for solutions. 2. Build llama runner. - ``` - cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DBUILD_TESTING=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out/examples/models/llama \ - examples/models/llama +``` +cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out/examples/models/llama \ + examples/models/llama - cmake --build cmake-out/examples/models/llama -j16 --config Release - ``` +cmake --build cmake-out/examples/models/llama -j16 --config Release +``` 3. Run model. 
Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40). - ``` - cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= - ``` +``` +cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= +``` To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` diff --git a/examples/models/llama/config/llama_bf16.yaml b/examples/models/llama/config/llama_bf16.yaml new file mode 100644 index 00000000000..8e89e8aa437 --- /dev/null +++ b/examples/models/llama/config/llama_bf16.yaml @@ -0,0 +1,7 @@ +base: + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: bf16 \ No newline at end of file diff --git a/examples/models/llama/config/llama_q8da4w.yaml b/examples/models/llama/config/llama_q8da4w.yaml new file mode 100644 index 00000000000..476ae928c60 --- /dev/null +++ b/examples/models/llama/config/llama_q8da4w.yaml @@ -0,0 +1,11 @@ +base: + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + +model: + dtype_override: fp32 + +quantization: + qmode: 8da4w + group_size: 128 + embedding_quantize: 4,32 + \ No newline at end of file diff --git a/examples/models/llama/config/llama_xnnpack_qat.yaml b/examples/models/llama/config/llama_xnnpack_qat.yaml new file mode 100644 index 00000000000..2369ff1d279 --- /dev/null +++ b/examples/models/llama/config/llama_xnnpack_qat.yaml @@ -0,0 +1,23 @@ +base: + preq_mode: preq_8da4w_out_8da8w + preq_group_size: 32 + preq_embedding_quantize: 8,0 + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + use_lora: 16 + +model: + use_sdpa_with_kv_cache: True + use_kv_cache: True + dtype_override: fp32 + +export: + max_seq_length: 2048 + max_context_length: 2048 + +quantization: + use_qat: True + +backend: + xnnpack: + enabled: True + extended_ops: True \ No newline at end of file diff --git a/examples/models/llama/config/llama_xnnpack_spinquant.yaml b/examples/models/llama/config/llama_xnnpack_spinquant.yaml new file mode 100644 index 00000000000..441086d6f73 --- /dev/null +++ b/examples/models/llama/config/llama_xnnpack_spinquant.yaml @@ -0,0 +1,22 @@ +base: + preq_mode: preq_8da4w_out_8da8w + preq_group_size: 32 + preq_embedding_quantize: 8,0 + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + +model: + use_sdpa_with_kv_cache: True + use_kv_cache: True + dtype_override: fp32 + +export: + max_seq_length: 2048 + max_context_length: 2048 + +quantization: + use_spin_quant: native + +backend: + xnnpack: + enabled: True + extended_ops: True \ No newline at end of file diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md index d168d54226e..e90a61a771b 100644 --- a/examples/models/phi_4_mini/README.md +++ b/examples/models/phi_4_mini/README.md @@ -8,7 +8,7 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args: ``` base.model_class="phi_4_mini" -base.params="examples/models/phi-4-mini/config.json" +base.params="examples/models/phi-4-mini/config/config.json" base.checkpoint= ``` @@ -33,16 +33,10 @@ Export to XNNPack, no quantization: PHI_CHECKPOINT=path/to/checkpoint.pth python -m extension.llm.export.export_llm \ - base.model_class="phi_4_mini" \ + --config 
config/phi_4_mini_xnnpack.yaml base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ - base.params="examples/models/phi-4-mini/config.json" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="fp32" \ - backend.xnnpack.enabled=True \ - base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ + base.params="examples/models/phi-4-mini/config/config.json" \ export.output_name="phi-4-mini.pte" \ - debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/phi_4_mini/config.json b/examples/models/phi_4_mini/config/config.json similarity index 100% rename from examples/models/phi_4_mini/config.json rename to examples/models/phi_4_mini/config/config.json diff --git a/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml b/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml new file mode 100644 index 00000000000..9355bd99f64 --- /dev/null +++ b/examples/models/phi_4_mini/config/phi_4_mini_xnnpack.yaml @@ -0,0 +1,12 @@ +base: + model_class: phi_4_mini + metadata: '{"get_bos_id":151643, "get_eos_ids":[151643]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: fp32 + +backend: + xnnpack: + enabled: True \ No newline at end of file diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md index 57784169ece..c2c759880a7 100644 --- a/examples/models/qwen2_5/README.md +++ b/examples/models/qwen2_5/README.md @@ -8,7 +8,7 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args: ``` base.model_class="qwen2_5" -base.params="examples/models/qwen2_5/1_5b_config.json" +base.params="examples/models/qwen2_5/config/1_5b_config.json" base.checkpoint= ``` @@ -33,16 +33,11 @@ Export to XNNPack, no quantization: QWEN_CHECKPOINT=path/to/checkpoint.pth python -m extension.llm.export.export_llm \ + --config examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml base.model_class="qwen2_5" \ base.checkpoint="${QWEN_CHECKPOINT:?}" \ base.params="examples/models/qwen2_5/1_5b_config.json" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="fp32" \ - backend.xnnpack.enabled=True \ - base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \ export.output_name="qwen2_5-1_5b.pte" \ - debug.verbose=True ``` Run using the executor runner: diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/config/1_5b_config.json similarity index 100% rename from examples/models/qwen2_5/1_5b_config.json rename to examples/models/qwen2_5/config/1_5b_config.json diff --git a/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml b/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml new file mode 100644 index 00000000000..0e5c6f7624e --- /dev/null +++ b/examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml @@ -0,0 +1,11 @@ +base: + metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: fp32 + +backend: + xnnpack: + enabled: True \ No newline at end of file diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md index e24d8da2637..9eb2f889363 100644 --- a/examples/models/qwen3/README.md +++ b/examples/models/qwen3/README.md @@ -8,7 +8,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp All commands for 
exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args: ``` base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b] -base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json] +base.params=[examples/models/qwen3/config/0_6b_config.json,examples/models/qwen3/config/1_7b_config.json,examples/models/config/qwen3/4b_config.json] ``` ### Example export @@ -17,49 +17,29 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama Export 0.6b to XNNPack, quantized with 8da4w: ``` python -m extension.llm.export.export_llm \ + --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml base.model_class="qwen3_0_6b" \ - base.params="examples/models/qwen3/0_6b_config.json" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="fp32" \ - backend.xnnpack.enabled=True \ - backend.xnnpack.extended_ops=True \ - quantization.qmode="8da4w" \ - base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + base.params="examples/models/qwen3/config/0_6b_config.json" \ export.output_name="qwen3_0_6b.pte" \ - debug.verbose=True + ``` Export 1.7b to XNNPack, quantized with 8da4w: ``` python -m extension.llm.export.export_llm \ + --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml base.model_class="qwen3_1_7b" \ - base.params="examples/models/qwen3/1_7b_config.json" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="fp32" \ - backend.xnnpack.enabled=True \ - backend.xnnpack.extended_ops=True \ - quantization.qmode="8da4w" \ - base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + base.params="examples/models/qwen3/config/1_7b_config.json" \ export.output_name="qwen3_1_7b.pte" \ - debug.verbose=True ``` Export 4b to XNNPack, quantized with 8da4w: ``` python -m extension.llm.export.export_llm \ + --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml base.model_class="qwen3_4b" \ - base.params="examples/models/qwen3/4b_config.json" \ - model.use_kv_cache=True \ - model.use_sdpa_with_kv_cache=True \ - model.dtype_override="fp32" \ - backend.xnnpack.enabled=True \ - backend.xnnpack.extended_ops=True \ - quantization.qmode="8da4w" \ - base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \ + base.params="examples/models/qwen3/config/4b_config.json" \ export.output_name="qwen3_4b.pte" \ - debug.verbose=True ``` ### Example run diff --git a/examples/models/qwen3/0_6b_config.json b/examples/models/qwen3/config/0_6b_config.json similarity index 100% rename from examples/models/qwen3/0_6b_config.json rename to examples/models/qwen3/config/0_6b_config.json diff --git a/examples/models/qwen3/1_7b_config.json b/examples/models/qwen3/config/1_7b_config.json similarity index 100% rename from examples/models/qwen3/1_7b_config.json rename to examples/models/qwen3/config/1_7b_config.json diff --git a/examples/models/qwen3/4b_config.json b/examples/models/qwen3/config/4b_config.json similarity index 100% rename from examples/models/qwen3/4b_config.json rename to examples/models/qwen3/config/4b_config.json diff --git a/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml b/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml new file mode 100644 index 00000000000..60292b1ecdc --- /dev/null +++ b/examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml @@ -0,0 +1,15 @@ +base: + metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}' + 
+model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: fp32 + +quantization: + qmode: 8da4w + +backend: + xnnpack: + enabled: True + extended_ops: True \ No newline at end of file From 9787a7ebdf83c3accb2ad947f8f136b7ddd264ca Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:29:10 -0700 Subject: [PATCH 6/7] Update with + --- .../deepseek-r1-distill-llama-8B/README.md | 6 ++--- examples/models/llama/README.md | 24 +++++++++---------- examples/models/phi_4_mini/README.md | 6 ++--- examples/models/qwen2_5/README.md | 8 +++---- examples/models/qwen3/README.md | 18 +++++++------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index 311ee52eb16..00397e9f60f 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -54,9 +54,9 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") ``` python -m extension.llm.export.export_llm \ --config examples/models/deepseek-r1-distill-llama-8B/config/deepseek-r1-distill-llama-8B - base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ - base.params=params.json \ - export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" + +base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ + +base.params=params.json \ + +export.output_name="DeepSeek-R1-Distill-Llama-8B.pte" ``` 6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3. diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index fb4927177ee..3a3434935aa 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -169,9 +169,9 @@ LLAMA_PARAMS=path/to/params.json python -m extension.llm.export.export_llm \ --config examples/models/llamaconfig/llama_bf16.yaml - base.model_class="llama3_2" \ - base.checkpoint="${LLAMA_CHECKPOINT:?}" \ - base.params="${LLAMA_PARAMS:?}" \ + +base.model_class="llama3_2" \ + +base.checkpoint="${LLAMA_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" \ ``` For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). @@ -187,9 +187,9 @@ LLAMA_PARAMS=path/to/spinquant/params.json python -m extension.llm.export.export_llm \ --config examples/models/llama/config/llama_xnnpack_spinquant.yaml - base.model_class="llama3_2" \ - base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - base.params="${LLAMA_PARAMS:?}" \ + +base.model_class="llama3_2" \ + +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" \ ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). 
@@ -204,9 +204,9 @@ LLAMA_PARAMS=path/to/qlora/params.json python -m extension.llm.export.export_llm \ --config examples/models/llama/config/llama_xnnpack_qat.yaml - base.model_class="llama3_2" \ - base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - base.params="${LLAMA_PARAMS:?}" \ + +base.model_class="llama3_2" \ + +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" \ ``` For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). @@ -220,9 +220,9 @@ You can export and run the original Llama 3 8B instruct model. ``` python -m extension.llm.export.export_llm \ --config examples/models/llama/config/llama_q8da4w.yaml - base.model_clas="llama3" - base.checkpoint= \ - base.params= \ + +base.model_clas="llama3" + +base.checkpoint= \ + +base.params= \ ``` Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size. diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md index e90a61a771b..8fb2f03ac4c 100644 --- a/examples/models/phi_4_mini/README.md +++ b/examples/models/phi_4_mini/README.md @@ -34,9 +34,9 @@ PHI_CHECKPOINT=path/to/checkpoint.pth python -m extension.llm.export.export_llm \ --config config/phi_4_mini_xnnpack.yaml - base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ - base.params="examples/models/phi-4-mini/config/config.json" \ - export.output_name="phi-4-mini.pte" \ + +base.checkpoint="${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \ + +base.params="examples/models/phi-4-mini/config/config.json" \ + +export.output_name="phi-4-mini.pte" \ ``` Run using the executor runner: diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md index c2c759880a7..566a7a5c30b 100644 --- a/examples/models/qwen2_5/README.md +++ b/examples/models/qwen2_5/README.md @@ -34,10 +34,10 @@ QWEN_CHECKPOINT=path/to/checkpoint.pth python -m extension.llm.export.export_llm \ --config examples/models/qwen2_5/config/qwen2_5_xnnpack_q8da4w.yaml - base.model_class="qwen2_5" \ - base.checkpoint="${QWEN_CHECKPOINT:?}" \ - base.params="examples/models/qwen2_5/1_5b_config.json" \ - export.output_name="qwen2_5-1_5b.pte" \ + +base.model_class="qwen2_5" \ + +base.checkpoint="${QWEN_CHECKPOINT:?}" \ + +base.params="examples/models/qwen2_5/1_5b_config.json" \ + +export.output_name="qwen2_5-1_5b.pte" \ ``` Run using the executor runner: diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md index 9eb2f889363..d2d89db93c2 100644 --- a/examples/models/qwen3/README.md +++ b/examples/models/qwen3/README.md @@ -18,9 +18,9 @@ Export 0.6b to XNNPack, quantized with 8da4w: ``` python -m extension.llm.export.export_llm \ --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml - base.model_class="qwen3_0_6b" \ - base.params="examples/models/qwen3/config/0_6b_config.json" \ - export.output_name="qwen3_0_6b.pte" \ + +base.model_class="qwen3_0_6b" \ + +base.params="examples/models/qwen3/config/0_6b_config.json" \ + +export.output_name="qwen3_0_6b.pte" \ ``` @@ -28,18 +28,18 @@ Export 1.7b to XNNPack, quantized 
with 8da4w: ``` python -m extension.llm.export.export_llm \ --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml - base.model_class="qwen3_1_7b" \ - base.params="examples/models/qwen3/config/1_7b_config.json" \ - export.output_name="qwen3_1_7b.pte" \ + +base.model_class="qwen3_1_7b" \ + +base.params="examples/models/qwen3/config/1_7b_config.json" \ + +export.output_name="qwen3_1_7b.pte" \ ``` Export 4b to XNNPack, quantized with 8da4w: ``` python -m extension.llm.export.export_llm \ --config examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml - base.model_class="qwen3_4b" \ - base.params="examples/models/qwen3/config/4b_config.json" \ - export.output_name="qwen3_4b.pte" \ + +base.model_class="qwen3_4b" \ + +base.params="examples/models/qwen3/config/4b_config.json" \ + +export.output_name="qwen3_4b.pte" \ ``` ### Example run From 261d9db487ca341ca56daae850f9c2ebe62d3ebc Mon Sep 17 00:00:00 2001 From: Jack <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:31:23 -0700 Subject: [PATCH 7/7] Update README.md --- examples/models/llama/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index f125daf7992..bbd2107ad74 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -187,9 +187,9 @@ LLAMA_PARAMS=path/to/spinquant/params.json python -m extension.llm.export.export_llm \ --config examples/models/llama/config/llama_xnnpack_spinquant.yaml - +base.model_class="llama3_2" \ - +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - +base.params="${LLAMA_PARAMS:?}" \ + +base.model_class="llama3_2" \ + +base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + +base.params="${LLAMA_PARAMS:?}" ``` For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). @@ -221,8 +221,8 @@ You can export and run the original Llama 3 8B instruct model. python -m extension.llm.export.export_llm \ --config examples/models/llama/config/llama_q8da4w.yaml +base.model_clas="llama3" - +base.checkpoint= \ - +base.params= \ + +base.checkpoint= \ + +base.params= ``` Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size.
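For readers following the `--config` flow end to end: the entry point strips `--config <file>` from `sys.argv` and re-expresses it as Hydra's `--config-path`/`--config-name` flags, leaving the remaining `key=value` / `+key=value` tokens for Hydra to parse (see `pop_config_arg` and `add_hydra_config_args` in patch 2). Below is a condensed, self-contained sketch of that argv rewrite; `rewrite_argv_for_hydra` is an illustrative helper name, not a function from the patch:

```python
import os

def rewrite_argv_for_hydra(argv):
    """Replace '--config <file.yaml>' with Hydra's --config-path / --config-name flags."""
    argv = list(argv)
    if "--config" in argv:
        idx = argv.index("--config")
        config_file = argv.pop(idx + 1)   # the yaml path
        argv.pop(idx)                     # the '--config' flag itself
        argv += [
            "--config-path", os.path.abspath(os.path.dirname(config_file) or "."),
            "--config-name", os.path.basename(config_file),
        ]
    return argv

# Remaining tokens ('key=value', '+key=value') are left untouched for Hydra to parse.
print(rewrite_argv_for_hydra([
    "export_llm.py",
    "--config", "examples/models/qwen3/config/qwen3_xnnpack_q8da4w.yaml",
    "+base.model_class=qwen3_0_6b",
]))
```

The rewritten argv is what the `@hydra.main`-decorated `hydra_main` actually sees, so values from the config file and appended `+` overrides all go through the same Hydra composition path used by the README commands in the later patches.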