Commit cce28b8

Merge branch 'main' into theme-migration

2 parents: 4a4957a + bdf658b

47 files changed: +4138 −786 lines

.github/workflows/build-presets.yml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos, ios, ios-simulator, pybind, llm]
+        preset: [macos, ios, ios-simulator, pybind, profiling, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
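
This extends the CI build matrix to cover the new profiling preset, which is defined in CMakePresets.json below.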

CMakePresets.json

Lines changed: 20 additions & 0 deletions
@@ -100,6 +100,26 @@
         "list": ["Darwin", "Linux", "Windows"]
       }
     },
+    {
+      "name": "profiling",
+      "displayName": "Build ExecuTorch with Profiling Enabled",
+      "inherits": [
+        "common"
+      ],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/profiling.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": [
+          "Darwin",
+          "Linux",
+          "Windows"
+        ]
+      }
+    },
     {
       "name": "zephyr",
       "displayName": "Build ExecuTorch for Zephyr RTOS",

backends/apple/coreml/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ runtime.python_library(
     name = "backend",
     srcs = glob([
         "compiler/*.py",
+        "logging.py",
     ]),
     visibility = [
         "@EXECUTORCH_CLIENTS",
@@ -33,6 +34,7 @@ runtime.python_library(
     name = "partitioner",
     srcs = glob([
         "partition/*.py",
+        "logging.py",
     ]),
     visibility = [
         "@EXECUTORCH_CLIENTS",

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 5 additions & 4 deletions
@@ -16,20 +16,20 @@
 
 import coremltools as ct
 import coremltools.optimize as cto
-
 from executorch.backends.apple.coreml import executorchcoreml
+from executorch.backends.apple.coreml.logging import get_coreml_log_level
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
     PreprocessResult,
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
 from executorch.backends.apple.coreml.compiler.torch_ops import *  # noqa: F401, F403
 
+logger = logging.getLogger(__name__)
+logger.setLevel(get_coreml_log_level(default_level=logging.WARNING))
+
 
 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
@@ -409,6 +409,7 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+        logger.info(f"Edge program: {edge_program}")
        model_type: CoreMLBackend.MODEL_TYPE = (
            CoreMLBackend.model_type_from_compile_specs(
                compile_specs,
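
Note that the new logger.info call dumps the entire edge program at preprocess time. Since the module logger now defaults to WARNING via get_coreml_log_level, the dump only appears when ET_COREML_LOG_LEVEL (added in logging.py below) is set to INFO or DEBUG.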

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 25 additions & 2 deletions
@@ -9,13 +9,15 @@
 # the op to the coremltools library.
 
 import torch as _torch
-from coremltools import _logger as logger
+from coremltools import _logger
 from coremltools.converters.mil.frontend import _utils
 from coremltools.converters.mil.frontend.torch.ops import (
     _get_inputs,
+    _get_kwinputs,
     NUM_TO_NUMPY_DTYPE,
     NUM_TO_TORCH_DTYPE,
     split,
+    to,
     transpose,
     unbind,
 )
@@ -24,6 +26,7 @@
     register_torch_op,
 )
 from coremltools.converters.mil.mil import types
+from executorch.exir.dim_order_utils import get_memory_format
 
 
 # https://github.com/apple/coremltools/pull/2556
@@ -44,6 +47,26 @@ def split_copy(context, node):
     split(context, node)
 
 
+@register_torch_op(
+    torch_alias=[
+        "dim_order_ops::_to_dim_order_copy",
+        "dim_order_ops._to_dim_order_copy",
+    ],
+    override=False,
+)
+def _to_dim_order_copy(context, node):
+    dim_order = _get_kwinputs(context, node, "dim_order", default=[None])[0]
+    node.kwinputs.pop("dim_order")
+
+    # In CoreML, dim_order.val will be an ndarray, so we convert it to a list
+    dim_order = [int(d) for d in dim_order.val]
+    memory_format = get_memory_format(dim_order)
+    assert (
+        memory_format == _torch.contiguous_format
+    ), "Only contiguous memory format is supported in CoreML"
+    to(context, node)
+
+
 # https://github.com/apple/coremltools/pull/2558
 @register_torch_op(
     torch_alias=["torchao::dequantize_affine", "torchao.dequantize_affine"],
@@ -88,7 +111,7 @@ def dequantize_affine(context, node):
     out_np_dtype = None
     if len(inputs) > 7:
         out_np_dtype = NUM_TO_NUMPY_DTYPE[inputs[7].val]
-        logger.warning(
+        _logger.warning(
             f"Core ML ignores output_dtype {out_np_dtype} on torchao.dequantize_affine and instead uses the native precision."
         )
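
To illustrate the contiguity check inside the new _to_dim_order_copy handler, here is a minimal sketch, assuming an ExecuTorch install and that get_memory_format maps the identity order to the contiguous format (the orders shown are illustrative):

import torch
from executorch.exir.dim_order_utils import get_memory_format

# The identity order [0, 1, 2, 3] maps to the contiguous memory format,
# which is the only case the handler accepts.
assert get_memory_format([0, 1, 2, 3]) == torch.contiguous_format

# A permuted order such as [0, 2, 3, 1] (channels-last) maps elsewhere
# and would trip the handler's assert.
assert get_memory_format([0, 2, 3, 1]) != torch.contiguous_format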

backends/apple/coreml/logging.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright © 2023 Apple Inc. All rights reserved.
+#
+# Please refer to the license found in the LICENSE file in the root directory of the source tree.
+
+import logging
+import os
+from typing import Optional
+
+
+def get_coreml_log_level(default_level: int) -> Optional[str]:
+    level_str = os.environ.get("ET_COREML_LOG_LEVEL", "").upper()
+    if level_str == "":
+        return default_level
+
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    if level_str not in level_map:
+        raise ValueError(f"Invalid ET_COREML_LOG_LEVEL: {level_str}")
+    return level_map[level_str]
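
A minimal usage sketch, assuming an ExecuTorch install: the override must be in the environment before coreml_preprocess.py or coreml_partitioner.py is imported, because both call get_coreml_log_level at import time.

import logging
import os

# Must be set before importing the Core ML backend modules, which read
# it when their module-level loggers are configured.
os.environ["ET_COREML_LOG_LEVEL"] = "DEBUG"

from executorch.backends.apple.coreml.logging import get_coreml_log_level

# The environment variable wins over the per-module default; an
# unrecognized value raises ValueError.
assert get_coreml_log_level(default_level=logging.WARNING) == logging.DEBUG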

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 3 additions & 1 deletion
@@ -10,6 +10,8 @@
 import torch
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
+
+from executorch.backends.apple.coreml.logging import get_coreml_log_level
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 from executorch.exir.backend.partitioner import (
@@ -23,7 +25,7 @@
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+logger.setLevel(get_coreml_log_level(default_level=logging.INFO))
 
 
 def _is_view_op(op: torch._ops.OpOverload) -> bool:

backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py

Lines changed: 18 additions & 10 deletions
@@ -7,6 +7,7 @@
 
 import torch
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import is_buffer, is_param
 
 
 class UnsqueezeScalarPlaceholdersPass(ExportPass):
@@ -19,23 +20,27 @@ def __init__(self, exported_program):
         self.exported_program = exported_program
         super().__init__()
 
-    def _is_inputs_to_buffers_or_parameters(self, node):
-        return (
-            node.name in self.exported_program.graph_signature.inputs_to_buffers
-            or node.name in self.exported_program.graph_signature.inputs_to_parameters
-        )
-
     def call(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             if node.op != "placeholder":
                 continue
             rank = node.meta["val"].dim()
             if rank == 0:
-                if not self._is_inputs_to_buffers_or_parameters(node):
+                if is_buffer(self.exported_program, node):
+                    name = self.exported_program.graph_signature.inputs_to_buffers[
+                        node.name
+                    ]
+                elif is_param(self.exported_program, node):
+                    name = self.exported_program.graph_signature.inputs_to_parameters[
+                        node.name
+                    ]
+                else:
                     continue
-                tensor = self.exported_program.state_dict[node.name]
+
+                tensor = self.exported_program.state_dict[name]
+
                 if tensor.dim() == 0:
-                    self.exported_program.state_dict[node.name] = tensor.unsqueeze(0)
+                    self.exported_program.state_dict[name] = tensor.unsqueeze(0)
                     node.meta["val"] = node.meta["val"].fake_mode.from_tensor(
                         tensor.unsqueeze(0), static_shapes=True
                     )
@@ -53,6 +58,9 @@ def ensures(self, graph_module: torch.fx.GraphModule):
             if node.op == "placeholder":
                 rank = node.meta["val"].dim()
                 if rank == 0:
-                    if not self._is_inputs_to_buffers_or_parameters(node):
+                    if not (
+                        is_buffer(self.exported_program, node)
+                        or is_param(self.exported_program, node)
+                    ):
                         continue
                     raise ValueError("Placeholders of rank 0 are not supported!")
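
The subtlety this rewrite fixes: a placeholder's node.name is a graph-input name, while state_dict is keyed by buffer or parameter names, so the lookup must go through the graph signature. Below is a hypothetical helper (not part of the pass) distilling that translation:

from typing import Optional

import torch
from torch._export.utils import is_buffer, is_param
from torch.export import ExportedProgram


def state_dict_key(ep: ExportedProgram, node: torch.fx.Node) -> Optional[str]:
    # Translate a placeholder's graph-input name into its state_dict key,
    # mirroring the branch structure the pass now uses.
    if is_buffer(ep, node):
        return ep.graph_signature.inputs_to_buffers[node.name]
    if is_param(ep, node):
        return ep.graph_signature.inputs_to_parameters[node.name]
    return None  # constants and user inputs are skipped by the pass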

backends/cadence/aot/compiler.py

Lines changed: 27 additions & 4 deletions
@@ -54,7 +54,7 @@
 # if the quantizer here is different from the quantizer used to convert. It is
 # however useful for unit tests to separate the converted model from the fused
 # model, to be able to get reference numerics.
-# If this does not apply, please use quantize_and_fuse_pt2 instead.
+# If this does not apply, please use quantize_pt2 instead.
 def trace(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
@@ -85,6 +85,29 @@ def trace(
 
 
 def prepare_pt2(
+    model: torch.nn.Module,
+    inputs: tuple[object, ...],
+    quantizer: CadenceQuantizer,
+    dump_graphs: bool = False,
+) -> torch.fx.GraphModule:
+    """
+    Trace and Prepare a model using the given quantizer.
+    The quantizer must be supplied and be the same as the one used to
+    fuse the model later, if applicable. If you do not expect that behavior,
+    please use quantize_pt2 instead, which will instantiate a
+    default quantizer for you if needed.
+    Returns a GraphModule with the prepared model.
+    """
+
+    traced_program = trace(model, inputs, dump_graphs=dump_graphs)
+    prepared_program = prepare_traced_pt2(
+        traced_program, quantizer, dump_graphs=dump_graphs
+    )
+
+    return prepared_program
+
+
+def prepare_traced_pt2(
     program: ExportedProgram,
     quantizer: CadenceQuantizer,
     dump_graphs: bool = False,
@@ -93,7 +116,7 @@ def prepare_pt2(
     Prepare a model using the given quantizer.
     The quantizer must be supplied and be the same as the one used to
     fuse the model later, if applicable. If you do not expect that behavior,
-    please use quantize_and_fuse_pt2 instead, which will instantiate a
+    please use quantize_pt2 instead, which will instantiate a
     default quantizer for you if needed.
     Returns a GraphModule with the prepared model.
     """
@@ -137,7 +160,7 @@ def fuse_pt2(
     """
     Fuse a converted graph module using the given quantizer.
     The quantizer must be the same as the one used to convert the model.
-    If you do not expect that behavior, please use quantize_and_fuse_pt2 instead,
+    If you do not expect that behavior, please use quantize_pt2 instead,
     which will instantiate a default quantizer for you if needed.
     Returns a GraphModule with the fused model.
     """
@@ -179,7 +202,7 @@ def quantize_pt2(
         logging.info(program.graph.print_tabular())
 
     # Get prepared graph module
-    prepared_gm = prepare_pt2(program, quantizer, dump_graphs=dump_graphs)
+    prepared_gm = prepare_pt2(model, inputs, quantizer, dump_graphs=dump_graphs)
 
     # Calibrate
     # If no calibration data is provided, use the inputs
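
A sketch of the reshaped API under the names above, with an illustrative model: prepare_pt2 now traces internally, while prepare_traced_pt2 keeps the old entry point for callers that already hold an ExportedProgram.

import torch

from executorch.backends.cadence.aot.compiler import (
    prepare_pt2,
    prepare_traced_pt2,
    trace,
)
from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceDefaultQuantizer,
)

# Illustrative model and inputs, not from the commit.
model = torch.nn.Linear(4, 4)
example_inputs = (torch.randn(1, 4),)
quantizer = CadenceDefaultQuantizer()

# One step: prepare_pt2 traces and prepares.
prepared_gm = prepare_pt2(model, example_inputs, quantizer)

# Equivalent two-step form for callers holding an ExportedProgram.
ep = trace(model, example_inputs)
prepared_gm = prepare_traced_pt2(ep, quantizer)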

backends/cadence/aot/export_example.py

Lines changed: 1 addition & 5 deletions
@@ -19,7 +19,6 @@
     export_to_executorch_gen_etrecord,
     fuse_pt2,
     prepare_pt2,
-    trace,
 )
 
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
@@ -50,11 +49,8 @@ def export_model(
     # Instantiate the quantizer
     quantizer = CadenceDefaultQuantizer()
 
-    # Trace the model
-    ep = trace(model, example_inputs)
-
     # Prepare the model
-    prepared_gm = prepare_pt2(ep, quantizer)
+    prepared_gm = prepare_pt2(model, example_inputs, quantizer)
 
     # Calibrate the model
     for samples in [example_inputs]:
