
Commit 8505bbb

Update base for Update on "PT2ArchiveDataMap"
This diff introduces the PT2ArchiveDataMap, which reads the weights in .pt2 archive files.

1. Open the archive file with miniz.
2. There are two JSON config files (weights_config.json, constants_config.json) mapping weight name -> {weight path, weight metadata}. Open and extract the weight information into unordered_maps, then free the JSON blobs.
3. For get_tensor_layout calls, return the JSON information.
4. For get_data calls, use miniz to calculate the offset + size and then use the data loader. PT2 archive files are not compressed (to allow mmap-ing), so this is fine.

PT2 archive format: https://docs.google.com/document/d/1xdx3I4zK6naPEWX3e49rCUccZeAC9zMLCFKXvUQFR7o/edit?tab=t.0
Serde: https://docs.google.com/document/d/11X-KsLPMJGdEr4sG4sCNLnGLhSKrc8utDGMQqFbZx9E/edit?tab=t.0#heading=h.tsw6d16xh497

TODO in subsequent diffs:
- Convert stride to dim order.
- Additional testing: failure cases, a model file with constants as well as weights, a model with no weights.
- CMake for OSS.

Differential Revision: [D81248896](https://our.internmc.facebook.com/intern/diff/D81248896/)

[ghstack-poisoned]
2 parents: 8c79a53 + 151f3be
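The commit message above describes a four-step lookup flow. Below is an illustrative Python sketch of that flow, not the actual C++ PT2ArchiveDataMap (which uses miniz and an ExecuTorch data loader); only the two config file names come from the commit message, while the class name and the "path"/"metadata" JSON fields are assumptions for illustration.

import json
import zipfile


class ArchiveWeightIndex:
    """Hypothetical stand-in for PT2ArchiveDataMap, for illustration only."""

    def __init__(self, archive_path: str) -> None:
        # Step 1: open the .pt2 archive (a zip container).
        self._zf = zipfile.ZipFile(archive_path)
        # Step 2: parse the weight/constant configs into one name -> entry map,
        # then drop the raw JSON blobs.
        self._entries = {}
        for member in self._zf.namelist():
            if member.endswith(("weights_config.json", "constants_config.json")):
                self._entries.update(json.loads(self._zf.read(member)))

    def get_tensor_layout(self, weight_name: str):
        # Step 3: layout queries are answered from the parsed metadata
        # ("metadata" is an assumed field name).
        return self._entries[weight_name]["metadata"]

    def get_data(self, weight_name: str) -> bytes:
        # Step 4: entries are stored uncompressed, so the payload can be read
        # (or mmap'd) directly at its offset inside the archive.
        info = self._zf.getinfo(self._entries[weight_name]["path"])
        assert info.compress_type == zipfile.ZIP_STORED
        with self._zf.open(info) as f:
            return f.read()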


444 files changed: +20895 -2645 lines changed


.ci/scripts/test_model.sh

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ test_model() {
 bash examples/models/llava/install_requirements.sh
 STRICT="--no-strict"
 fi
-if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
+if [[ "${MODEL_NAME}" == "qwen2_5_1_5b" ]]; then
 # Install requirements for export_llama
 bash examples/models/llama/install_requirements.sh
 # Test export_llm script: python3 -m extension.llm.export.export_llm.

.github/workflows/pull.yml

Lines changed: 7 additions & 0 deletions
@@ -929,7 +929,14 @@ jobs:
 CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" \
 .ci/scripts/setup-linux.sh --build-tool "cmake"

+# Custom operator tests
 PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add
+./cmake-out/backends/vulkan/test/custom_ops/q8csw_linear
+./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
+
+# Run e2e testing for selected operators. More operators will be tested via this
+# route in the future.
+python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"

 nxp-build-test:
 name: nxp-build-test

.github/workflows/trunk.yml

Lines changed: 4 additions & 4 deletions
@@ -176,7 +176,7 @@ jobs:
 - model: phi_4_mini
 backend: portable
 runner: linux.arm64.m7g.4xlarge
-- model: qwen2_5
+- model: qwen2_5_1_5b
 backend: portable
 runner: linux.arm64.2xlarge
 - model: llama3_2_vision_encoder
@@ -823,10 +823,10 @@ jobs:
 --tsv_path ${TSV_PATH}
 echo "::endgroup::"

-test-huggingface-transformers-coreml:
+test-huggingface-transformers-macos:
 # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
 if: ${{ !github.event.pull_request.head.repo.fork }}
-name: test-huggingface-transformers-coreml
+name: test-huggingface-transformers-macos
 uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
 permissions:
 id-token: write
@@ -844,10 +844,10 @@ jobs:
 # phi4-mini|xnnpack|--quantize,
 # smollm2-135m|xnnpack|--quantize,
 # smollm3-3b|xnnpack|--quantize,
+# qwen3-1.7b|xnnpack|--quantize,
 # CoreML.
 llama3.2-1b|coreml_fp32_gpu|--quantize,
 qwen3-0.6b|coreml_fp32_gpu|--quantize,
-qwen3-1.7b|xnnpack|--quantize,
 smollm2-135m|coreml_fp32_gpu|--quantize,
 olmo-1b|coreml_fp32_gpu|--quantize,
 bert|coreml_fp32_gpu|--quantize,

CMakeLists.txt

Lines changed: 1 addition & 3 deletions
@@ -699,9 +699,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
 ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
 ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
 )
-add_subdirectory(
-${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
-)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/csrc/cpu)
 unset(EXECUTORCH_INCLUDE_DIRS)

 executorch_target_link_options_shared_lib(torchao_ops_executorch)

README.md

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ To get started you can:

 - Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
 - Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
-- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
+- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [Llava](examples/models/llava/README.md), [Voxtral](examples/models/voxtral/README.md), and [LFM2](examples/models/lfm2/README.md).

 ## Feedback and Engagement

backends/apple/coreml/TARGETS

Lines changed: 24 additions & 5 deletions
@@ -61,16 +61,21 @@ runtime.python_library(
 )

 runtime.python_library(
-name = "recipes",
-srcs = glob([
-"recipes/*.py",
-]),
+name = "coreml_recipes",
+srcs = [
+"recipes/__init__.py",
+"recipes/coreml_recipe_provider.py"
+],
 visibility = [
 "@EXECUTORCH_CLIENTS",
+"//executorch/export/...",
 ],
 deps = [
 "fbsource//third-party/pypi/coremltools:coremltools",
+":coreml_recipe_types",
 ":backend",
+":partitioner",
+":quantizer",
 "//caffe2:torch",
 "//executorch/exir:lib",
 "//executorch/exir/backend:compile_spec_schema",
@@ -80,6 +85,20 @@ runtime.python_library(
 ],
 )

+runtime.python_library(
+name = "coreml_recipe_types",
+srcs = [
+"recipes/coreml_recipe_types.py",
+],
+visibility = [
+"@EXECUTORCH_CLIENTS",
+"//executorch/export/...",
+],
+deps = [
+"//executorch/export:recipe",
+],
+)
+
 runtime.cxx_python_extension(
 name = "executorchcoreml",
 srcs = [
@@ -124,7 +143,7 @@ runtime.python_test(
 "fbsource//third-party/pypi/pytest:pytest",
 ":partitioner",
 ":quantizer",
-":recipes",
+":coreml_recipes",
 "//caffe2:torch",
 "//pytorch/vision:torchvision",
 "fbsource//third-party/pypi/scikit-learn:scikit-learn",

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 23 additions & 1 deletion
@@ -15,6 +15,7 @@
 from coremltools.converters.mil.frontend.torch.ops import (
 _get_inputs,
 _get_kwinputs,
+noop,
 NUM_TO_NUMPY_DTYPE,
 NUM_TO_TORCH_DTYPE,
 split,
@@ -91,6 +92,28 @@ def _to_dim_order_copy(context, node):
 to(context, node)


+@register_torch_op(
+torch_alias=[
+"dim_order_ops::_clone_dim_order",
+"dim_order_ops._clone_dim_order",
+],
+override=False,
+)
+def _clone_dim_order(context, node):
+dim_order = _get_kwinputs(context, node, "dim_order", default=[None])[0]
+node.kwinputs.pop("dim_order")
+
+# In CoreML, dim_order.val will be a ndarray, so we convert it to a list to check memory format.
+dim_order = [int(d) for d in dim_order.val]
+memory_format = get_memory_format(dim_order)
+assert (
+memory_format == _torch.contiguous_format
+), "Only contiguous memory format is supported in CoreML"
+
+# Since CoreML only supports contiguous format, no dim_order preservation is needed. Treat this as a no-op clone.
+noop(context, node)
+
+
 # https://github.com/apple/coremltools/pull/2558
 @register_torch_op(
 torch_alias=["torchao::dequantize_affine", "torchao.dequantize_affine"],
@@ -152,7 +175,6 @@ def dequantize_affine(context, node):
 int_data.astype(quantized_np_dtype),
 zero_point,
 scale,
-axis=-1,
 name=node.name,
 )
 context.add(output, node.name)
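The new _clone_dim_order registration above only admits clones whose dim_order is contiguous; anything else trips the assert. A minimal standalone sketch of that check (is_contiguous_dim_order is a hypothetical helper standing in for the get_memory_format(...) == _torch.contiguous_format comparison):

def is_contiguous_dim_order(dim_order) -> bool:
    # A contiguous (NCHW-style) dim order is simply 0, 1, ..., rank-1.
    return list(dim_order) == list(range(len(dim_order)))


assert is_contiguous_dim_order([0, 1, 2, 3])      # contiguous clone: treated as a no-op
assert not is_contiguous_dim_order([0, 2, 3, 1])  # channels-last: rejected by the converter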

backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 21 additions & 3 deletions
@@ -3,6 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.


+import logging
 from typing import Any, Optional, Sequence

 import coremltools as ct
@@ -111,8 +112,9 @@ def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> Non

 unexpected = set(kwargs.keys()) - expected_keys
 if unexpected:
-raise ValueError(
-f"Recipe '{recipe_type.value}' received unexpected parameters: {list(unexpected)}"
+logging.warning(
+f"CoreML recipe '{recipe_type.value}' ignoring unexpected parameters: {list(unexpected)}. "
+f"Expected parameters: {list(expected_keys)}"
 )

 self._validate_base_parameters(kwargs)
@@ -121,7 +123,13 @@ def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> Non

 def _get_expected_keys(self, recipe_type: RecipeType) -> set:
 """Get expected parameter keys for a recipe type"""
-common_keys = {"minimum_deployment_target", "compute_unit"}
+common_keys = {
+"minimum_deployment_target",
+"compute_unit",
+"skip_ops_for_coreml_delegation",
+"lower_full_graph",
+"take_over_constant_data",
+}

 if recipe_type in [
 CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP,
@@ -377,9 +385,19 @@ def _get_coreml_lowering_recipe(
 if minimum_deployment_target and minimum_deployment_target < ct.target.iOS18:
 take_over_mutable_buffer = False

+# Extract additional partitioner parameters
+skip_ops_for_coreml_delegation = kwargs.get(
+"skip_ops_for_coreml_delegation", None
+)
+lower_full_graph = kwargs.get("lower_full_graph", False)
+take_over_constant_data = kwargs.get("take_over_constant_data", True)
+
 partitioner = CoreMLPartitioner(
 compile_specs=compile_specs,
 take_over_mutable_buffer=take_over_mutable_buffer,
+skip_ops_for_coreml_delegation=skip_ops_for_coreml_delegation,
+lower_full_graph=lower_full_graph,
+take_over_constant_data=take_over_constant_data,
 )

 edge_compile_config = EdgeCompileConfig(
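With this change, the three partitioner options flow from recipe kwargs into CoreMLPartitioner, and unknown kwargs now log a warning instead of raising. A hedged usage sketch (the kwarg names and CoreMLRecipeType.PT2E_INT8_STATIC appear in this diff; the provider class name and import paths are assumptions based on the file layout):

from executorch.backends.apple.coreml.recipes.coreml_recipe_provider import (
    CoreMLRecipeProvider,  # assumed class name
)
from executorch.backends.apple.coreml.recipes.coreml_recipe_types import (
    CoreMLRecipeType,
)

provider = CoreMLRecipeProvider()
recipe = provider.create_recipe(
    CoreMLRecipeType.PT2E_INT8_STATIC,
    skip_ops_for_coreml_delegation=None,   # new pass-through kwarg
    lower_full_graph=False,                # new pass-through kwarg
    take_over_constant_data=True,          # new pass-through kwarg
    unknown_option=123,                    # now only logs a warning, no ValueError
)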

backends/apple/coreml/test/test_coreml_recipes.py

Lines changed: 0 additions & 25 deletions
@@ -185,14 +185,6 @@ def test_int4_weight_only_per_group_validation(self):
 )
 self.assertIn("must be positive", str(cm.exception))

-# Test unexpected parameter
-with self.assertRaises(ValueError) as cm:
-self.provider.create_recipe(
-CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL,
-group_size=32, # group_size not valid for per-channel
-)
-self.assertIn("unexpected parameters", str(cm.exception))
-
 def test_int8_weight_only_per_channel(self):
 """Test INT8 weight-only per-channel quantization"""
 model = TestHelperModules.TwoLinearModule().eval()
@@ -385,23 +377,6 @@ def forward(self, x):
 self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2)
 self._compare_eager_unquantized_model_outputs(session, model, example_inputs)

-def test_pt2e_recipes_parameter_rejection(self):
-"""Test that PT2E recipes reject TorchAO-specific parameters"""
-# PT2E recipes should reject TorchAO-specific parameters
-pt2e_recipes = [
-CoreMLRecipeType.PT2E_INT8_STATIC,
-CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY,
-]
-torchao_params = ["filter_fn", "group_size", "bits", "block_size"]
-
-for recipe_type in pt2e_recipes:
-for param in torchao_params:
-with self.subTest(recipe=recipe_type.value, param=param):
-kwargs = {param: "dummy_value"}
-with self.assertRaises(ValueError) as cm:
-self.provider.create_recipe(recipe_type, **kwargs)
-self.assertIn("unexpected parameters", str(cm.exception).lower())
-
 def test_filter_fn_comprehensive(self):
 """Comprehensive test for filter_fn parameter functionality"""

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 53 additions & 2 deletions
@@ -27,9 +27,9 @@
 class TestTorchOps(unittest.TestCase):
 edge_compile_config = executorch.exir.EdgeCompileConfig()

-def _coreml_partitioner(self):
+def _coreml_partitioner(self, *, minimum_deployment_target=ct.target.iOS18):
 compile_specs = CoreMLBackend.generate_compile_specs(
-minimum_deployment_target=ct.target.iOS18
+minimum_deployment_target=minimum_deployment_target
 )
 return CoreMLPartitioner(compile_specs=compile_specs)

@@ -158,6 +158,33 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
 et_prog = delegated_program.to_executorch()
 self._compare_outputs(et_prog, model, example_inputs)

+def test_dequantize_affine_c8w_embedding_c8w_linear_ios16(self):
+model, example_inputs = self._get_test_model()
+quantize_(
+model,
+IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
+lambda m, fqn: isinstance(m, torch.nn.Embedding),
+)
+quantize_(
+model,
+IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
+)
+ep = torch.export.export(model, example_inputs)
+delegated_program = executorch.exir.to_edge_transform_and_lower(
+ep,
+partitioner=[
+self._coreml_partitioner(minimum_deployment_target=ct.target.iOS16)
+],
+)
+for node in delegated_program.exported_program().graph.nodes:
+if node.op == "call_function":
+assert node.target.__name__ in [
+"executorch_call_delegate",
+"getitem",
+], f"Got unexpected node target after delegation: {node.target.__name__}"
+et_prog = delegated_program.to_executorch()
+self._compare_outputs(et_prog, model, example_inputs)
+
 def test_dequantize_codebook_linear_per_grouped_col(self):
 model, example_inputs = self._get_test_model()
 quantize_(
@@ -268,6 +295,28 @@ def test_dequantize_codebook_embedding_per_grouped_row(self):
 et_prog = delegated_program.to_executorch()
 self._compare_outputs(et_prog, model, example_inputs)

+def test__clone_dim_order_contiguous(self):
+class Model(torch.nn.Module):
+def forward(self, x):
+return torch.ops.dim_order_ops._clone_dim_order(
+x, dim_order=[0, 1, 2, 3]
+)
+
+model, example_inputs = Model(), (torch.randn(1, 3, 8, 8),)
+ep = torch.export.export(model, example_inputs)
+delegated_program = executorch.exir.to_edge_transform_and_lower(
+ep,
+partitioner=[self._coreml_partitioner()],
+)
+for node in delegated_program.exported_program().graph.nodes:
+if node.op == "call_function":
+assert node.target.__name__ in [
+"executorch_call_delegate",
+"getitem",
+], f"Got unexpected node target after delegation: {node.target.__name__}"
+et_prog = delegated_program.to_executorch()
+self._compare_outputs(et_prog, model, example_inputs)
+

 if __name__ == "__main__":
 test_runner = TestTorchOps()
@@ -276,7 +325,9 @@ def test_dequantize_codebook_embedding_per_grouped_row(self):
 test_runner.test_dequantize_affine_c4w_embedding()
 test_runner.test_dequantize_affine_c4w_linear()
 test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
+test_runner.test_dequantize_affine_c8w_embedding_c8w_linear_ios16()
 test_runner.test_dequantize_codebook_linear_per_grouped_col()
 test_runner.test_dequantize_codebook_linear_per_grouped_row()
 test_runner.test_dequantize_codebook_embedding_per_grouped_col()
 test_runner.test_dequantize_codebook_embedding_per_grouped_row()
+test_runner.test__clone_dim_order_contiguous()
