
Commit b5fd482

Update
[ghstack-poisoned]
2 parents 0e422bf + 37bdc0b commit b5fd482

158 files changed: +4018 -1645 lines


.ci/scripts/test_llama_lora.sh

Lines changed: 51 additions & 14 deletions
@@ -48,8 +48,17 @@ DOWNLOADED_PATH=$(
   --model_id "${HF_MODEL_REPO}" \
   --files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
 )
-EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
-# Export model.
+# Build llama runner.
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+# Constants.
+RUNTIME_ARGS="--tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
+PROMPT="What happens if you eat watermelon seeds?"
+EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
+
+# Export LoRA PTE file.
+MODEL_NAME="llama_3_2_1B_lora"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
   base.params="${DOWNLOADED_PATH}/params.json" \
@@ -61,36 +70,64 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   model.dtype_override="fp32" \
   backend.xnnpack.enabled=true \
   backend.xnnpack.extended_ops=true \
-  export.output_name="${EXPORTED_MODEL_NAME}"
-
-# Build llama runner.
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+  export.output_name="${MODEL_NAME}.pte"
 
-PROMPT="What happens if you eat watermelon seeds?"
 # Run llama runner
-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
-
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_NAME}.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
 RESULT=$(cat result.txt)
-EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
-
 if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
+  # Do not clean up files if test passes, as they're re-used in the next test.
   echo "Success"
-  cleanup_files
 else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
   echo "Failure; results not the same"
+  cleanup_files
+  exit 1
+fi
 
+# Export LoRA PTE, PTD file.
+MODEL_SEPARATE="${MODEL_NAME}_separate"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${MODEL_SEPARATE}.pte" \
+  export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
+
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT2=$(cat result2.txt)
+if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Success"
+  cleanup_files
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Failure; results not the same"
   cleanup_files
   exit 1
 fi

.github/workflows/build-presets.yml

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@ on:
     branches:
       - main
       - release/*
-    paths:
-      - .github/workflows/build-presets.yml
   workflow_dispatch:
 
 concurrency:

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ jobs:
 bash examples/models/moshi/mimi/install_requirements.sh
 
 # reinstall executorch
-bash ./install_executorch.sh
+bash ./install_executorch.sh --minimal
 
 # run python unittest
 python -m unittest examples.models.moshi.mimi.test_mimi

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ jobs:
           - test_arm_baremetal: test_models_tosa
           - test_arm_baremetal: test_models_ethos-u55
           - test_arm_baremetal: test_models_ethos-u85
+          - test_arm_baremetal: test_smaller_stories_llama
       fail-fast: false
     with:
       runner: linux.2xlarge.memory

.lintrunner.toml

Lines changed: 30 additions & 0 deletions
@@ -136,6 +136,36 @@ init_command = [
     '--requirement=requirements-lintrunner.txt',
 ]
 
+[[linter]]
+code = 'CMAKEFORMAT'
+include_patterns = [
+    "**/*.cmake",
+    "**/*.cmake.in",
+    "**/CMakeLists.txt",
+]
+exclude_patterns = [
+    'third-party/**',
+    '**/third-party/**',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'cmake_format_linter',
+    '--',
+    '@{{PATHSFILE}}',
+]
+init_command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'pip_init',
+    '--dry-run={{DRYRUN}}',
+    '--requirement=requirements-lintrunner.txt',
+]
+
 [[linter]]
 code = 'ETCAPITAL'
 include_patterns = [

CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -284,15 +284,19 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
   set(TORCHAO_BUILD_CPU_AARCH64 ON)
   set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
 
-  list(APPEND TORCHAO_INCLUDE_DIRS
+  list(
+    APPEND
+    TORCHAO_INCLUDE_DIRS
     ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
     ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
     ${EXECUTORCH_ROOT}/third-party/ao
   )
 
   set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
 
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
+  )
   executorch_target_link_options_shared_lib(torchao_ops_executorch)
   list(APPEND _executorch_kernels torchao_ops_executorch)
 endif()

backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 18 additions & 17 deletions
@@ -59,11 +59,11 @@ def create_recipe(
             return self._build_fp_recipe(recipe_type, ct.precision.FLOAT32, **kwargs)
         elif recipe_type == CoreMLRecipeType.FP16:
             return self._build_fp_recipe(recipe_type, ct.precision.FLOAT16, **kwargs)
-        elif recipe_type == CoreMLRecipeType.INT8_STATIC:
+        elif recipe_type == CoreMLRecipeType.PT2E_INT8_STATIC:
             return self._build_pt2e_quantized_recipe(
                 recipe_type, activation_dtype=torch.quint8, **kwargs
             )
-        elif recipe_type == CoreMLRecipeType.INT8_WEIGHT_ONLY:
+        elif recipe_type == CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY:
             return self._build_pt2e_quantized_recipe(
                 recipe_type, activation_dtype=torch.float32, **kwargs
             )
@@ -201,6 +201,19 @@ def _validate_codebook_parameters(
                 f"Parameter 'block_size' must be a list, got {type(block_size).__name__}: {block_size}"
             )
 
+    def _validate_and_set_deployment_target(
+        self, kwargs: Any, min_target: ct.target, quantization_type: str
+    ) -> None:
+        """Validate or set minimum deployment target for quantization recipes"""
+        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
+        if minimum_deployment_target and minimum_deployment_target < min_target:
+            raise ValueError(
+                f"minimum_deployment_target must be {str(min_target)} or higher for {quantization_type} quantization"
+            )
+        else:
+            # Default to the minimum target for this quantization type
+            kwargs["minimum_deployment_target"] = min_target
+
     def _build_fp_recipe(
         self,
         recipe_type: RecipeType,
@@ -227,13 +240,7 @@ def _build_pt2e_quantized_recipe(
         """Build PT2E-based quantization recipe"""
         from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
 
-        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
-        if minimum_deployment_target and minimum_deployment_target < ct.target.iOS17:
-            raise ValueError(
-                "minimum_deployment_target must be iOS17 or higher for codebook quantization"
-            )
-        # Default to iOS17 for quantization
-        kwargs["minimum_deployment_target"] = ct.target.iOS17
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS17, "pt2e")
 
         # Validate activation_dtype
         assert activation_dtype in [
@@ -292,7 +299,7 @@ def _build_torchao_quantized_recipe(
         )
 
         # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
-        kwargs["minimum_deployment_target"] = ct.target.iOS18
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
         lowering_recipe = self._get_coreml_lowering_recipe(**kwargs)
 
         return ExportRecipe(
@@ -313,13 +320,7 @@ def _build_codebook_quantized_recipe(
             CodebookWeightOnlyConfig,
         )
 
-        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
-        if minimum_deployment_target and minimum_deployment_target < ct.target.iOS18:
-            raise ValueError(
-                "minimum_deployment_target must be iOS18 or higher for codebook quantization"
-            )
-        # Default to iOS18 for codebook quantization
-        kwargs["minimum_deployment_target"] = ct.target.iOS18
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "codebook")
 
         # Get the appropriate dtype (torch.uint1 through torch.uint8)
         dtype = getattr(torch, f"uint{bits}")
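
The duplicated deployment-target checks in the PT2E, TorchAO, and codebook builders now funnel through the shared _validate_and_set_deployment_target helper: an explicit minimum_deployment_target below the recipe family's floor raises a ValueError, and an omitted one is defaulted to that floor. A minimal caller-side sketch of that behavior follows; the CoreMLRecipeProvider class name and import path are assumptions, while create_recipe, the recipe ids, and the minimum_deployment_target kwarg come from the diff above.

# Minimal sketch; CoreMLRecipeProvider and its import path are assumed names.
# create_recipe, the recipe ids, and minimum_deployment_target are from the diff above.
import coremltools as ct
from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider, CoreMLRecipeType

provider = CoreMLRecipeProvider()

# No target given: the helper defaults it to the floor for this recipe family (iOS17 for PT2E).
recipe = provider.create_recipe(CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY)

# Target below the floor: the helper raises instead of silently overriding it.
try:
    provider.create_recipe(
        CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY,
        minimum_deployment_target=ct.target.iOS16,
    )
except ValueError as err:
    print(err)  # "minimum_deployment_target must be ... or higher for pt2e quantization"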

backends/apple/coreml/recipes/coreml_recipe_types.py

Lines changed: 11 additions & 10 deletions
@@ -12,7 +12,7 @@
 class CoreMLRecipeType(RecipeType):
     """CoreML-specific generic recipe types"""
 
-    # All the recipes accept common kwargs
+    ## All the recipes accept common kwargs
     # 1. minimum_deployment_unit (default: None)
     # 2. compute_unit (default: ct.ComputeUnit.ALL)
 
@@ -22,27 +22,28 @@ class CoreMLRecipeType(RecipeType):
     # FP16 precision recipe, defaults to values published by the CoreML backend and partitioner
     FP16 = "coreml_fp16"
 
-    # PT2E-based quantization recipes
+    ## PT2E-based quantization recipes
     # INT8 Static Quantization (weights + activations), requires calibration dataset
-    INT8_STATIC = "coreml_int8_static"
+    PT2E_INT8_STATIC = "coreml_pt2e_int8_static"
     # INT8 Weight-only Quantization (activations remain FP32)
-    INT8_WEIGHT_ONLY = "coreml_int8_weight_only"
+    PT2E_INT8_WEIGHT_ONLY = "coreml_pt2e_int8_weight_only"
 
-    # TorchAO-based quantization recipes
+    ## TorchAO-based quantization recipes
     # All TorchAO recipes accept filter_fn kwarg to control which layers are quantized
     # INT4 Weight-only Quantization, per-channel (axis=0)
-    # Additional kwargs: filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: filter_fn (default: None - quantizes linear layers)
     INT4_WEIGHT_ONLY_PER_CHANNEL = "coreml_int4_weight_only_per_channel"
     # INT4 Weight-only Quantization, per-group
-    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers)
     INT4_WEIGHT_ONLY_PER_GROUP = "coreml_int4_weight_only_per_group"
     # INT8 Weight-only Quantization, per-channel (axis=0)
-    # Additional kwargs: filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: filter_fn (default: None - quantizes linear layers)
     INT8_WEIGHT_ONLY_PER_CHANNEL = "coreml_int8_weight_only_per_channel"
     # INT8 Weight-only Quantization, per-group
-    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers)
     INT8_WEIGHT_ONLY_PER_GROUP = "coreml_int8_weight_only_per_group"
-    # Codebook/Palettization Quantization
+
+    ## Codebook/Palettization Quantization
     # Additional kwargs: bits (1-8, default: 3), block_size (default: [-1, 16]),
     # filter_fn (default: targets Linear and Embedding layers only)
     CODEBOOK_WEIGHT_ONLY = "coreml_codebook_weight_only"
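
Both the PT2E enum members and their string ids changed, so callers that referenced either form need updating. A small migration sketch follows; the import path is assumed from the file location and RecipeType is assumed to behave like a standard string-valued Enum, while the old and new names are taken from the diff above.

# Migration sketch; import path assumed from the file location, RecipeType assumed
# to be a string-valued Enum. Old -> new names are taken from the diff above.
from executorch.backends.apple.coreml.recipes.coreml_recipe_types import CoreMLRecipeType

# Old member / id                      New member / id
# CoreMLRecipeType.INT8_STATIC      -> CoreMLRecipeType.PT2E_INT8_STATIC
# "coreml_int8_static"              -> "coreml_pt2e_int8_static"
# CoreMLRecipeType.INT8_WEIGHT_ONLY -> CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY
# "coreml_int8_weight_only"         -> "coreml_pt2e_int8_weight_only"

assert CoreMLRecipeType.PT2E_INT8_STATIC.value == "coreml_pt2e_int8_static"
assert CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY.value == "coreml_pt2e_int8_weight_only"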
