pytorch
diff --git a/‎CMakeLists.txt‎
Lines changed: 11 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎backends/apple/coreml/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎backends/apple/coreml/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/apple/coreml/partition/coreml_partitioner.py‎
Lines changed: 15 additions & 1 deletion b/‎backends/apple/coreml/partition/coreml_partitioner.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj‎
Lines changed: 2 additions & 2 deletions b/‎backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/apple/coreml/test/test_coreml_partitioner.py‎
Lines changed: 46 additions & 0 deletions b/‎backends/apple/coreml/test/test_coreml_partitioner.py‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎backends/arm/CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions b/‎backends/arm/CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/arm/README.md‎
Lines changed: 12 additions & 11 deletions b/‎backends/arm/README.md‎
Lines changed: 12 additions & 11 deletions
diff --git a/‎backends/arm/TARGETS‎
Lines changed: 4 additions & 1 deletion b/‎backends/arm/TARGETS‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 10 additions & 3 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎backends/arm/_passes/convert_full_like_to_full_pass.py‎
Lines changed: 33 additions & 0 deletions b/‎backends/arm/_passes/convert_full_like_to_full_pass.py‎
Lines changed: 33 additions & 0 deletions
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
        OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+       OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -245,7 +249,7 @@ cmake_dependent_option(
 )
 
 if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
-  set(EXECUTORCH_BUILF_EXTENSION_DATA_LOADER ON)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
@@ -348,6 +352,7 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
 endif()
 
 if(EXECUTORCH_BUILD_TESTS)
+  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
 endif()
 
@@ -373,7 +378,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
       "fix for this restriction."
   )
 endif()
-set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
+set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10)
 
 #
 # The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -717,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
 
@@ -134,7 +134,7 @@ target_include_directories(
   coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
 )
 target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
-target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
 target_link_libraries(coremldelegate PRIVATE executorch_core)
 
 
@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple
 
 import coremltools as ct
 
@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []  # targets of supported nodes, in graph order; not de-duplicated
+        op_support = OperatorsSupportedForCoreMLBackend()  # default-constructed support checker
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)  # skip non-OpOverload targets
+                and op_support.is_node_supported(None, node)  # NOTE(review): first arg passed as None - confirm it is unused
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None  # second element None: no per-node filter callable is supplied
@@ -922,7 +922,7 @@
 					"$(SRCROOT)/../kvstore",
 					"$(SRCROOT)/../inmemoryfs",
 					"$(SRCROOT)/../include",
-					"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
+					"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
 					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -954,7 +954,7 @@
 					"$(SRCROOT)/../kvstore",
 					"$(SRCROOT)/../inmemoryfs",
 					"$(SRCROOT)/../include",
-					"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
+					"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
 					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
 
@@ -13,6 +13,7 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
 
 
 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]
 
+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)  # float mask, shape (seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)  # the same ep feeds both flows below
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2 = edge_program_manager2.to_backend(coreml_partitioner)  # keep the delegated manager that to_backend returns
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
     test_runner = TestCoreMLPartitioner()
     test_runner.test_add_sub_skip_mm()
     test_runner.test_vit_skip_conv()
+    test_runner.test_ops_to_not_decompose()
     test_runner.test_buffer()
@@ -1,4 +1,4 @@
-# Copyright 2023 Arm Limited and/or its affiliates.
+# Copyright 2023, 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -14,15 +14,15 @@ endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
-set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
+set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
 # Third-party folder and Ethos-U driver inclued
 set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
 include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp
+set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
                            backends/arm/runtime/VelaBinStream.cpp
 )
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 
@@ -15,7 +15,7 @@ ethos-u-vela compilation stack. which follows the fully AoT flow.
 ## Layout
 
 Export:
-- `arm_backend.py` - Main entrypoint for the ArmPartitioner and ArmBackend. For more information see the section on
+- `ethosu_backend.py` - Main entrypoint for the EthosUBackend. For more information see the section on
 [Arm Backend Architecture](#arm-backend-architecture). For examples of use see `executorch/examples/arm`.
 - `tosa_mapping.py` - utilities for mapping edge dialect to TOSA
 - `tosa_quant_utils.py` - utilities for mapping quantization information to TOSA encoding
@@ -29,11 +29,11 @@ Passes:
 - `*_pass.py` - Compiler passes derived from ExportPass
 
 Quantization:
-- `arm_quantizer.py` - Quantizer for Arm backend
+- `arm_quantizer.py` - Quantizers for Arm backend. Contains the EthosUQuantizer which inherits from the TOSAQuantizer
 - `arm_quantizer_utils.py` - Utilities for quantization
 
 Runtime:
-- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U
+- `runtime/EthosUBackend.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U
 
 Other:
 - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U
@@ -177,6 +177,7 @@ create an issue on [github](https://www.github.com/pytorch/executorch/issues).
 # Arm Backend Architecture
 
 The broad principle with the Arm backend implemention for ExecuTorch is to support multiple Arm devices and device configurations through a largely Homogeneous flow with maximal sharing of class logic.
+The EthosUBackend is currently the one user-facing API that targets the Ethos-U55 and Ethos-U85 hardware IP. It uses the TOSABackend under the hood to share code and functionality, and also so that the TOSA flow itself can be tested separately.
 
 In practice for compilation, this means that the flow goes via [Arm TOSA](https://www.mlplatform.org/tosa/tosa_spec.html) to produce a common IR and quantization behaviour compatible with our various IP, and typically, device-specific backends to further lower to a device specific binary which can happen ahead of time (within the Python development flow) or at runtime (during a JIT compilation stage).
 
@@ -185,22 +186,22 @@ In practice for the runtime, this means we will share common runtime backend fun
 
 ## Arm Backend Status and Maturity
 
-The Arm Backend should be considered a prototype quality at this point, likely subject to significant change and improvement, and with a limited coverage of functionality. We are actively developing this codebase.
+The Arm EthosU Backend should be considered prototype quality at this point, likely subject to significant change and improvement, and with limited coverage of functionality. We are actively developing this codebase.
 
 ## Current flows
 
-The ArmBackend has a two stage process,
-- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v0.80 TOSA BI with specific concern to a subset which gives support on Ethos-U55, the target of the initial prototype efforts.
+The EthosUBackend has a two stage process,
+- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v0.80 TOSA BI with specific concern to a subset which gives support on Ethos-U55 and Ethos-U85, the targets of the initial prototype efforts. This calls into the TOSABackend.
 - Lower via the ethos-u-vela compilation flow which takes TOSA v0.80 as an input and produces a low level commandstream for the hardware which is then passed via the delegate to the ethos-u-core-driver for direct execution.
 
-The ArmPartitioner is currenly used to ensure the operations converted are Ethos-U compatible, but will be extended to offer spec-correct TOSA Base inference and TOSA Main Inference generation in future.
+The EthosUPartitioner is currently used to ensure the operations converted are Ethos-U compatible, but will be extended to offer spec-correct TOSA Base inference and TOSA Main Inference generation in future.
+
+There is also a generic TOSABackend with accompanying TOSAPartitioner and TOSAQuantizer, which are used by the EthosUBackend and friends. The Arm TOSA Backend can be used on its own to verify the lowering to the TOSA representation of the model (refer to the unit tests in backends/arm/test, which use the TOSA backend in the test suites).
 
 ### Controlling compilation
 
 It is possible to control the compilation flow to aid in development and debug of both networks and the code itself.
 
-Configuration of the ArmBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implemntation.
-
-As this is in active development see the ArmBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)
+Configuration of the EthosUBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implementation.
 
-You can also refer to the [example TOSA end-to-end code](/examples/arm/arm_tosa_e2e.py)
+As this is in active development see the EthosUBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)
@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_partitioner",
     srcs = [
-        "arm_partitioner.py",
+        "ethosu_backend.py",
+        "ethosu_partitioner.py",
+        "tosa_backend.py",
+        "tosa_partitioner.py",
     ],
     typing = True,
     deps = [
 
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -18,6 +18,9 @@
 from executorch.backends.arm._passes.convert_expand_copy_to_repeat import (
     ConvertExpandCopyToRepeatPass,
 )
+from executorch.backends.arm._passes.convert_full_like_to_full_pass import (
+    ConvertFullLikeToFullPass,
+)
 from executorch.backends.arm._passes.convert_split_to_slice import (
     ConvertSplitToSlicePass,
 )
@@ -49,6 +52,7 @@
 from executorch.backends.arm._passes.fuse_quantized_activation_pass import (  # type: ignore[import-not-found]
     FuseQuantizedActivationPass,
 )
+from executorch.backends.arm._passes.insert_rescales_pass import InsertRescalePass
 from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
     KeepDimsFalseToSqueezePass,
@@ -72,6 +76,7 @@
     UnsqueezeScalarPlaceholdersPass,
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
+
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_manager import PassManager
@@ -95,6 +100,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(ConvertMeanDimToAveragePoolPass())
+        self.add_pass(ConvertFullLikeToFullPass())
 
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
@@ -115,7 +121,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(AnnotateChannelsLastDimOrder())
-
+        self.add_pass(InsertRescalePass())
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
@@ -133,7 +139,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxesPass())
-
+        self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
@@ -153,6 +159,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(AnnotateChannelsLastDimOrder())
+        self.add_pass(InsertRescalePass())
 
         return self._transform(exported_program.graph_module)
 
 
@@ -0,0 +1,33 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class ConvertFullLikeToFullPass(ExportPass):
+    """As per the full_like pytorch documentation,
+    `torch.full_like(input, fill_value)` is equivalent to
+    `torch.full(input.size(),
+                fill_value,
+                dtype=input.dtype,
+                layout=input.layout,
+                device=input.device
+                )`
+    Skip layout and device since it's not relevant for our backend.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in [
+            exir_ops.edge.aten.full_like.default,
+        ]:
+            return super().call_operator(op, args, kwargs, meta)  # every other op passes through unchanged
+
+        tensor = args[0].data  # args[0] is full_like's reference input; .data presumably its tensor value - confirm
+        full_args = (list(tensor.shape), args[1])  # (size, fill_value) positional arguments for full
+        full_kwargs = {"dtype": tensor.dtype}  # NOTE(review): incoming kwargs (e.g. an explicit dtype) are dropped here - confirm intended
+        return super().call_operator(
+            exir_ops.edge.aten.full.default, full_args, full_kwargs, meta  # original node meta is reused
+        )
Original file line number	Diff line number	Diff line change
`@@ -134,7 +134,7 @@ target_include_directories(`
`134`	`134`	`coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util`
`135`	`135`	`)`
`136`	`136`	`target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)`
`137`		`-target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)`
	`137`	`+target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)`
`138`	`138`	`target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)`
`139`	`139`	`target_link_libraries(coremldelegate PRIVATE executorch_core)`
`140`	`140`