
Commit 4328bb9

Merge branch 'main' into docs/jhelsby/new-contributor-guide-update
2 parents b5e7a75 + d9c31fa commit 4328bb9

File tree

8 files changed (+97, -130 lines)

backends/apple/coreml/README.md

Lines changed: 1 addition & 106 deletions
@@ -1,8 +1,7 @@
 # ExecuTorch Core ML Delegate
 
-
 This subtree contains the Core ML Delegate implementation for ExecuTorch.
-Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices.
+Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md).
 
 ## Layout
 - `compiler/` : Lowers a module to Core ML backend.
@@ -19,110 +18,6 @@
 - `workspace` : Xcode workspace for the runtime.
 - `third-party/`: External dependencies.
 
-## Partition and Delegation
-
-To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**.
-
-```python
-import torch
-import executorch.exir
-
-from executorch.backends.apple.coreml.compiler import CoreMLBackend
-from executorch.backends.apple.coreml.partition import CoreMLPartitioner
-
-class Model(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.sin(x)
-
-source_model = Model()
-example_inputs = (torch.ones(1), )
-
-# Export the source model to Edge IR representation
-aten_program = torch.export.export(source_model, example_inputs)
-edge_program_manager = executorch.exir.to_edge(aten_program)
-
-# Delegate to Core ML backend
-delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner())
-
-# Serialize delegated program
-executorch_program = delegated_program_manager.to_executorch()
-with open("model.pte", "wb") as f:
-    f.write(executorch_program.buffer)
-```
-
-The module will be fully or partially delegated to **Core ML**, depending on whether all or part of ops are supported by the **Core ML** backend. User may force skip certain ops by `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)`
-
-The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/), `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**.
-
-## Quantization
-
-To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQuantizer**.
-
-```python
-import torch
-import executorch.exir
-
-from torch.export import export_for_training
-from torch.ao.quantization.quantize_pt2e import (
-    convert_pt2e,
-    prepare_pt2e,
-    prepare_qat_pt2e,
-)
-
-from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from coremltools.optimize.torch.quantization.quantization_config import (
-    LinearQuantizerConfig,
-    QuantizationScheme,
-)
-
-class Model(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.conv = torch.nn.Conv2d(
-            in_channels=3, out_channels=16, kernel_size=3, padding=1
-        )
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        a = self.conv(x)
-        return self.relu(a)
-
-source_model = Model()
-example_inputs = (torch.randn((1, 3, 256, 256)), )
-
-pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()
-
-quantization_config = LinearQuantizerConfig.from_dict(
-    {
-        "global_config": {
-            "quantization_scheme": QuantizationScheme.symmetric,
-            "activation_dtype": torch.quint8,
-            "weight_dtype": torch.qint8,
-            "weight_per_channel": True,
-        }
-    }
-)
-quantizer = CoreMLQuantizer(quantization_config)
-
-# For post-training quantization, use `prepare_pt2e`
-# For quantization-aware trainin,g use `prepare_qat_pt2e`
-prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer)
-
-prepared_graph(*example_inputs)
-converted_graph = convert_pt2e(prepared_graph)
-```
-
-The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** similarly through **CoreMLPartitioner**
-
-## Runtime
-
-To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked there are no additional steps required, ExecuTorch when running the program would call the Core ML runtime to execute the Core ML delegated part of the program.
-
-Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library.
-
 ## Help & Improvements
 If you have problems or questions or have suggestions for ways to make
 implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues).

backends/cadence/aot/compiler.py

Lines changed: 6 additions & 3 deletions
@@ -31,11 +31,11 @@
     EdgeProgramManager,
     ExecutorchBackendConfig,
     ExecutorchProgramManager,
-    to_edge,
 )
 from executorch.exir.pass_base import PassResult
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
+from executorch.exir.program._program import to_edge_with_preserved_ops
 from torch._inductor.decomposition import remove_decompositions
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
@@ -80,6 +80,7 @@ def convert_pt2(
         torch.ops.aten.layer_norm.default,
         torch.ops.aten.linear.default,
         torch.ops.aten.matmul.default,
+        torch.ops.aten.rms_norm.default,
     ]
     # Remove decompositions for the ops we want to keep
     # pyre-fixme[6]: For 1st argument expected `Dict[typing.Callable[..., typing.Any
@@ -201,9 +202,9 @@ def lower_ep_to_edge(
     """
     Lower an ExportedProgram to an EdgeProgramManager (in edge IR).
     """
-    # Call to_edge to convert the graph to edge IR.
+    # Call to_edge_with_preserved_ops to convert the graph to edge IR.
     # Note: dim_order is skipped (https://github.com/pytorch/executorch/issues/3704)
-    edge_prog_manager = to_edge(
+    edge_prog_manager = to_edge_with_preserved_ops(
         expo_program,
         compile_config=EdgeCompileConfig(
             _skip_dim_order=True,
@@ -216,9 +217,11 @@
                 torch.ops.aten.linalg_vector_norm.default,
                 torch.ops.aten.unfold.default,
                 torch.ops.aten.angle.default,
+                torch.ops.aten.rms_norm.default,
             ],
         ),
         constant_methods=constant_methods,
+        preserve_ops=(torch.ops.aten.rms_norm.default,),
     )
 
     if dump_graphs:
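
For orientation, here is a rough sketch of what the new lowering path looks like from a caller's side. It assumes a PyTorch build that provides `torch.nn.functional.rms_norm` and `torch.ops.aten.rms_norm.default`; the `Norm` module, input shape, and omission of `constant_methods` are illustrative rather than taken from this commit.

```python
# Hedged sketch (not part of this commit): preserve aten.rms_norm through edge
# lowering so a backend can match the op directly instead of its decomposition.
import torch

from executorch.exir import EdgeCompileConfig
from executorch.exir.program._program import to_edge_with_preserved_ops


class Norm(torch.nn.Module):
    def forward(self, x):
        # Assumes torch.nn.functional.rms_norm is available in this PyTorch build.
        return torch.nn.functional.rms_norm(x, (x.shape[-1],))


ep = torch.export.export(Norm(), (torch.randn(2, 8),))

# preserve_ops keeps aten.rms_norm.default from being decomposed during edge
# lowering, mirroring the call made in lower_ep_to_edge above.
edge = to_edge_with_preserved_ops(
    ep,
    compile_config=EdgeCompileConfig(_skip_dim_order=True),
    preserve_ops=(torch.ops.aten.rms_norm.default,),
)
```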

backends/cadence/aot/ops_registrations.py

Lines changed: 0 additions & 4 deletions
@@ -139,7 +139,6 @@
     "int in_zero_point, bool channel_last=False) -> (Tensor out)"
 )
 lib.define("linalg_vector_norm(Tensor X) -> (Tensor Y)")
-lib.define("rms_norm(Tensor X, float eps, Tensor W) -> (Tensor Y)")
 lib.define(
     "transposed_im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
     "int[2] output_padding, Tensor in_zero_point, bool channel_last=False) -> (Tensor out)"
@@ -211,9 +210,6 @@
     "fully_connected.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)"
 )
 lib.define("linalg_vector_norm.out(Tensor X, *, Tensor(a!) out) -> Tensor(a!)")
-lib.define(
-    "rms_norm.out(Tensor X, float eps, Tensor W, *, Tensor(a!) out) -> Tensor(a!)"
-)
 lib.define(
     "quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, "
     "Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"

exir/emit/_emitter.py

Lines changed: 19 additions & 6 deletions
@@ -1640,13 +1640,26 @@ def placeholder( # noqa: C901
                 else:
                     spec.extra_tensor_info.fully_qualified_name = fqn
                     spec.extra_tensor_info.location = TensorDataLocation.EXTERNAL
-            if self.emitter_state.emit_mutable_buffer_names and is_mutable_buffer:
-                if spec.extra_tensor_info is None:
-                    spec.extra_tensor_info = ExtraTensorInfo(
-                        fully_qualified_name=fqn, location=TensorDataLocation.SEGMENT
+
+            if is_mutable_buffer:
+                # Emit names if we are supposed to.
+                if self.emitter_state.emit_mutable_buffer_names:
+                    if spec.extra_tensor_info is None:
+                        spec.extra_tensor_info = ExtraTensorInfo(
+                            fully_qualified_name=fqn,
+                            location=TensorDataLocation.SEGMENT,
+                        )
+                    else:
+                        spec.extra_tensor_info.fully_qualified_name = fqn
+                # if We aren't emitting the name then it needs to be memory planned.
+                elif spec.mem_id is None or spec.mem_offset is None:
+                    raise InternalError(
+                        self._emit_node_specific_error(
+                            self.node,
+                            # [2:] to remove the b_ prefix buffers get
+                            f'Mutable buffer "{target[2:]}" must have a memory id and offset if we are emitting it without a name. Please either memory plan your mutable buffers or call to_executorch with config=ExecutorchBackendConfig(emit_mutable_buffer_names=True)',
+                        )
                     )
-                else:
-                    spec.extra_tensor_info.fully_qualified_name = fqn
 
             # From the fqn find the corresponding tensor
             real_tensor = None
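
In short, the emitter now enforces one rule for mutable buffers: either memory plan them (the default) or emit their fully qualified names. A hedged sketch of the second option follows; the `Counter` module is illustrative and not part of this commit.

```python
# Hedged sketch (not part of this commit): emit mutable buffer names so the
# buffer does not need planned storage. With neither a plan nor a name, the
# emitter now raises InternalError.
import torch

from executorch.exir import ExecutorchBackendConfig, to_edge


class Counter(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("step", torch.zeros(1))

    def forward(self, x):
        self.step.add_(1)
        return x + self.step


edge = to_edge(torch.export.export(Counter(), (torch.randn(1),)))
prog = edge.to_executorch(
    config=ExecutorchBackendConfig(emit_mutable_buffer_names=True)
)
```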

exir/emit/test/test_emit.py

Lines changed: 33 additions & 1 deletion
@@ -1838,8 +1838,40 @@ def forward(self, x):
         ep = to_edge(ep)
         # Lower the graph to executorch.
         ep = ep.to_executorch(
-            config=ExecutorchBackendConfig(emit_mutable_buffer_names=True)
+            config=ExecutorchBackendConfig(
+                emit_mutable_buffer_names=True,
+                memory_planning_pass=MemoryPlanningPass(alloc_mutable_buffers=False),
+            )
         )
         for val in ep.executorch_program.execution_plan[0].values:
             if isinstance(val, Tensor) and val.extra_tensor_info:
                 self.assertEqual(val.extra_tensor_info.fully_qualified_name, "buffer")
+                self.assertEqual(val.allocation_info, None)
+
+    def test_emit_mutable_buffer_names_fails(self) -> None:
+        class Net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(2, 2)
+                self.register_buffer("buffer", torch.zeros(1, 2))
+
+            def forward(self, x):
+                self.buffer.add_(1)
+                return self.linear(x) + self.buffer
+
+        net = Net()
+
+        ep = export(net, (torch.randn(1, 2),), strict=True)
+        # Lower the graph to edge dialect.
+        ep = to_edge(ep)
+        # Lower the graph to executorch.
+        # Must emit mutable buffer names if we don't allocate mutable buffers
+        with self.assertRaises(InternalError):
+            ep.to_executorch(
+                config=ExecutorchBackendConfig(
+                    emit_mutable_buffer_names=False,
+                    memory_planning_pass=MemoryPlanningPass(
+                        alloc_mutable_buffers=False
+                    ),
+                )
+            )

exir/memory_planning.py

Lines changed: 10 additions & 2 deletions
@@ -44,12 +44,14 @@ def __init__(
         graph_module: torch.fx.GraphModule,
         alloc_graph_input: bool,
         alloc_graph_output: bool,
+        alloc_mutable_buffers: bool,
         graph_signature: Optional[ExportGraphSignature] = None,
     ) -> None:
         self.graph_module = graph_module
         self.graph_signature = graph_signature
         self.alloc_graph_input = alloc_graph_input
         self.alloc_graph_output = alloc_graph_output
+        self.alloc_mutable_buffers = alloc_mutable_buffers
 
     @classmethod
     def mem_obj_id_match(
@@ -149,6 +151,7 @@ def verify_storage_reuse(
             ignore_const=True,
             ignore_graph_input=not self.alloc_graph_input,
             ignore_graph_output=not self.alloc_graph_output,
+            ignore_mutable_buffers=not self.alloc_mutable_buffers,
             do_assertion=False,
             ignore_out_var_node=False,
             dedup=True,
@@ -374,6 +377,7 @@ def collect_specs_from_nodes( # noqa: C901
     graph_signature: Optional[ExportGraphSignature] = None,
     ignore_graph_input: bool = False,
     ignore_graph_output: bool = False,
+    ignore_mutable_buffers: bool = False,
     ignore_const: bool = True,
     ignore_out_var_node: bool = True,
     dedup: bool = True,
@@ -414,6 +418,9 @@ def collect_specs_from_nodes( # noqa: C901
         if _is_inplace_node(node):
             continue
 
+        if _is_mutable_buffer(node, graph_signature) and ignore_mutable_buffers:
+            continue
+
         if do_assertion:
             internal_assert(
                 node.op in ("placeholder", "output")
@@ -469,6 +476,7 @@ def update_all_tensors_lifetime(
     Set the lifetime for all the tensors encountered in the Fx graph.
     """
     specs = set()
+
     for node_idx, node in enumerate(graph_module.graph.nodes):
         for spec in collect_specs_from_nodes(
             filter_nodes(itertools.chain([node], node.args, node.kwargs.values())),
@@ -1053,6 +1061,7 @@ def apply_algo(
     graph_signature: Optional[ExportGraphSignature] = None,
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
+    alloc_mutable_buffers: bool = True,
 ) -> List[int]:
     """
     Recursively apply algo to graph_module and its submodules for control flow.
@@ -1065,19 +1074,18 @@
     storage with tensors in the outer module.
     TODO: make these optimizations once we have some baseline working.
     """
-
     # Extract the nodes and their lifespans from the graph_module
     # Difficult to just filter the list of specs returned by this due to
     # how we flag trainable weights.
     _ = update_all_tensors_lifetime(graph_module, graph_signature)
-
     # Filter specs based on alloc_graph_input and alloc_graph_output
     specs = collect_specs_from_nodes(
         graph_module.graph.nodes,
         graph_signature,
         do_assertion=False,
         ignore_graph_input=not alloc_graph_input,
         ignore_graph_output=not alloc_graph_output,
+        ignore_mutable_buffers=not alloc_mutable_buffers,
     )
 
     # Get extra padding for XNNPACK if needed
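
The new flag defaults to True everywhere, so existing callers keep planning storage for mutable buffers; it only changes behavior when a caller opts out. A hedged sketch of that opt-out, mirroring the configuration used in the test above (the pass constructor argument is wired through in the next file):

```python
# Hedged sketch (not part of this commit): skip memory planning for mutable
# buffers. collect_specs_from_nodes then ignores their specs, so the buffers
# must be emitted by name per the emitter change earlier in this commit.
from executorch.exir import ExecutorchBackendConfig
from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass

config = ExecutorchBackendConfig(
    emit_mutable_buffer_names=True,
    memory_planning_pass=MemoryPlanningPass(alloc_mutable_buffers=False),
)
```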

exir/passes/memory_planning_pass.py

Lines changed: 7 additions & 2 deletions
@@ -44,6 +44,7 @@ def __init__(
         allow_lifetime_and_storage_overlap: bool = False,
         alloc_graph_input: bool = True,
         alloc_graph_output: bool = True,
+        alloc_mutable_buffers: bool = True,
         alignment: int = ALIGNMENT,
     ) -> None:
         r"""
@@ -54,10 +55,11 @@
         """
         if memory_planning_algo is None:
             memory_planning_algo = MemoryPlanningAlgorithmSuite()
-        self.memory_planning_algo = memory_planning_algo
+        self.memory_planning_algo: Callable[..., List[int]] = memory_planning_algo
         self.allow_lifetime_and_storage_overlap = allow_lifetime_and_storage_overlap
         self.alloc_graph_input = alloc_graph_input
         self.alloc_graph_output = alloc_graph_output
+        self.alloc_mutable_buffers = alloc_mutable_buffers
         self.alignment = alignment
 
     def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
@@ -124,13 +126,15 @@ def run(
         # customized fields. Using the graph_module object to convey information across
         # passes/stages is quite natural and avoid yet another 'context' data structure
         # to do the job.
+
         _ = apply_algo(
-            self.memory_planning_algo,  # pyre-ignore[6]
+            self.memory_planning_algo,
             graph_module,
             self.alignment,
             graph_signature,
             self.alloc_graph_input,
             self.alloc_graph_output,
+            self.alloc_mutable_buffers,
         )
 
         # TODO: make the verifier do the work recursively to handle
@@ -139,6 +143,7 @@
             graph_module,
             self.alloc_graph_input,
             self.alloc_graph_output,
+            self.alloc_mutable_buffers,
             graph_signature,
         )
 
0 commit comments

Comments
 (0)