Commit 5c4c6ce (merge of 2 parents: 8230848 + b35e7b1)

Update

[ghstack-poisoned]

17 files changed: +150 −68 lines

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 14 additions & 8 deletions

@@ -23,25 +23,27 @@
 from torch.fx.passes.operator_support import OperatorSupportBase

 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
+logger.setLevel(logging.INFO)


-class OperatorsSupportedForCoreMLBackend(OperatorSupportBase):
+class _OperatorsSupportedForCoreMLBackend(OperatorSupportBase):
     def __init__(
         self,
         skip_ops_for_coreml_delegation: Optional[List[str]] = None,
         lower_full_graph: bool = False,
+        log: bool = False,
     ) -> None:
         if skip_ops_for_coreml_delegation is None:
             skip_ops_for_coreml_delegation = []
         super().__init__()
         self.skip_ops_for_coreml_delegation = skip_ops_for_coreml_delegation
         self.lower_full_graph = lower_full_graph
         self._logged_msgs = set()
+        self._log = log

     def log_once(self, msg: str) -> None:
-        if msg not in self._logged_msgs:
-            logging.info(msg)
+        if self._log and msg not in self._logged_msgs:
+            logger.info(msg)
         self._logged_msgs.add(msg)

     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:

@@ -154,8 +156,10 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:

         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
-            OperatorsSupportedForCoreMLBackend(
-                self.skip_ops_for_coreml_delegation, self.lower_full_graph
+            _OperatorsSupportedForCoreMLBackend(
+                self.skip_ops_for_coreml_delegation,
+                self.lower_full_graph,
+                log=True,
             ),
             allows_single_node_partition=True,
         )

@@ -191,8 +195,10 @@ def ops_to_not_decompose(
         self, ep: ExportedProgram
     ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
         do_not_decompose = []
-        op_support = OperatorsSupportedForCoreMLBackend(
-            self.skip_ops_for_coreml_delegation, self.lower_full_graph
+        op_support = _OperatorsSupportedForCoreMLBackend(
+            self.skip_ops_for_coreml_delegation,
+            self.lower_full_graph,
+            log=False,
         )

         # CoreML prevents certain ops (like triu) from lowering to CoreML when put in the ExecuTorch op namespace
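This change does two things: it routes messages through the module-level logger instead of the root logger (a bare logging.info goes to the root logger, whose default WARNING level silently drops it), and it makes logging opt-in so partition() can report skipped ops while ops_to_not_decompose() reuses the same checker silently. A minimal standalone sketch of the opt-in, log-once pattern; the class and messages here are hypothetical, not part of the commit:

import logging

logging.basicConfig(level=logging.INFO)  # give the demo a handler so INFO prints
logger = logging.getLogger(__name__)


class SupportChecker:
    """Hypothetical checker demonstrating the opt-in, log-once pattern."""

    def __init__(self, log: bool = False) -> None:
        self._log = log
        self._logged_msgs = set()

    def log_once(self, msg: str) -> None:
        # Emit only when logging was requested and the message is new;
        # record it either way so the dedup set stays consistent.
        if self._log and msg not in self._logged_msgs:
            logger.info(msg)
        self._logged_msgs.add(msg)


verbose = SupportChecker(log=True)                    # like partition(): user-facing
verbose.log_once("skipping op: unsupported::linear")  # logged once
verbose.log_once("skipping op: unsupported::linear")  # deduplicated
SupportChecker().log_once("never shown")              # like ops_to_not_decompose(): silent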

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 7 additions & 2 deletions

@@ -16,7 +16,6 @@
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
 from executorch.exir.backend.utils import format_delegated_graph
-from executorch.runtime import Runtime


 @torch.library.custom_op("unsupported::linear", mutates_args=())

@@ -37,7 +36,13 @@ def _(
     return torch.ops.aten.linear.default(x, w, b)


-_TEST_RUNTIME = sys.platform == "darwin"
+def is_fbcode():
+    return not hasattr(torch.version, "git_version")
+
+
+_TEST_RUNTIME = (sys.platform == "darwin") and not is_fbcode()
+if _TEST_RUNTIME:
+    from executorch.runtime import Runtime


 class TestCoreMLPartitioner(unittest.TestCase):
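The gate relies on a detail of PyTorch's build metadata: OSS builds expose torch.version.git_version, while Meta-internal (fbcode) builds do not, so the attribute's absence marks an fbcode build. Deferring the executorch.runtime import behind the gate keeps the test module importable where the runtime is unavailable. A hedged sketch of the pattern; the test class is illustrative, not from the commit:

import sys
import unittest

import torch


def is_fbcode() -> bool:
    # OSS PyTorch records the git revision it was built from; fbcode
    # builds omit it, so a missing attribute marks an internal build.
    return not hasattr(torch.version, "git_version")


_TEST_RUNTIME = (sys.platform == "darwin") and not is_fbcode()
if _TEST_RUNTIME:
    # Import deferred until we know the runtime exists in this build.
    from executorch.runtime import Runtime  # noqa: F401


class RuntimeGateTest(unittest.TestCase):
    @unittest.skipUnless(_TEST_RUNTIME, "ExecuTorch runtime not available here")
    def test_runtime_importable(self):
        # Only reached on macOS OSS builds, where the import above ran.
        self.assertIn("Runtime", globals())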

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 12 additions & 4 deletions

@@ -14,12 +14,20 @@

 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
-from executorch.runtime import Runtime
 from torchao.quantization import IntxWeightOnlyConfig, PerAxis, PerGroup, quantize_

-_TEST_RUNTIME = sys.platform == "darwin" and tuple(
-    map(int, platform.mac_ver()[0].split("."))
-) >= (15, 0)
+
+def is_fbcode():
+    return not hasattr(torch.version, "git_version")
+
+
+_TEST_RUNTIME = (
+    (sys.platform == "darwin")
+    and not is_fbcode()
+    and tuple(map(int, platform.mac_ver()[0].split("."))) >= (15, 0)
+)
+if _TEST_RUNTIME:
+    from executorch.runtime import Runtime


 class TestTorchOps(unittest.TestCase):
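This file additionally requires macOS 15 or newer. platform.mac_ver()[0] yields a version string such as "15.3.1"; parsing it into an integer tuple keeps the comparison numeric, whereas comparing strings would rank "9.1" above "15.0". A small sketch of that check; the helper name is hypothetical:

import platform


def macos_at_least(major: int, minor: int = 0) -> bool:
    # mac_ver() returns ("", ...) on non-macOS; filtering out empty parts
    # yields an empty tuple, which compares as "too old" instead of crashing.
    version = platform.mac_ver()[0]
    parsed = tuple(int(part) for part in version.split(".") if part)
    return parsed >= (major, minor)


print(macos_at_least(15))  # True only on macOS 15.0 or newer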

examples/models/llama/CMakeLists.txt

Lines changed: 5 additions & 3 deletions

@@ -77,9 +77,11 @@ find_package(gflags REQUIRED)
 # llama_main: test binary to run llama, with tokenizer and sampler integrated
 #

-# find `executorch` libraries Same as for gflags
-set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
-find_package(executorch CONFIG REQUIRED)
+# find `executorch` libraries. CMAKE_PREFIX_PATH would work for host
+# compilation, but CMAKE_FIND_ROOT_PATH appears to be necessary for
+# cross-compiling (e.g., to Android) to work as well.
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
 target_link_options_shared_lib(executorch)

 # llama_runner library

examples/models/llava/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -76,8 +76,8 @@ find_package(gflags REQUIRED)
 #

 # find `executorch` libraries Same as for gflags
-set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
-find_package(executorch CONFIG REQUIRED)
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
 target_link_options_shared_lib(executorch)

 # llava_runner library

examples/models/phi-3-mini/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@ set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
 set(_common_include_directories
   ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
 )
-set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
-find_package(executorch CONFIG REQUIRED)
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)

 target_link_options_shared_lib(executorch)

exir/backend/test/test_backends.py

Lines changed: 1 addition & 1 deletion

@@ -1033,7 +1033,7 @@ def false_fn(x, y):

 def f(x, y):
     x = x + y
-    x = torch.ops.higher_order.cond(x[0][0] == 1, true_fn, false_fn, [x, y])
+    x = torch.cond(x[0][0] == 1, true_fn, false_fn, [x, y])
     x = x - y
     return x
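This and the test changes below swap torch.ops.higher_order.cond, the internal higher-order-op handle, for torch.cond, the public functional control-flow API; the public wrapper validates its inputs and is the form the torch.export documentation uses. A minimal export sketch using torch.cond, under the assumption of a recent PyTorch with torch.export available:

import torch
from torch.export import export


class CondModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        def true_fn(x):
            return x.sin()

        def false_fn(x):
            return x.cos()

        # Both branches are traced; the predicate is a scalar boolean tensor.
        return torch.cond(x.sum() > 0, true_fn, false_fn, [x])


ep = export(CondModule(), (torch.randn(3),))
# The exported graph still carries torch.ops.higher_order.cond under the hood.
print(ep.graph_module.code)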

exir/program/_program.py

Lines changed: 59 additions & 14 deletions

@@ -1076,6 +1076,28 @@ def keep(op):
     return list(filter(keep, preserve_ops))


+def _can_skip_using_EDGE_DO_NOT_DECOMP(
+    partitioner: Dict[str, List[Partitioner]], aten_programs: Dict[str, ExportedProgram]
+) -> bool:
+    # The current design of using EDGE_DO_NOT_DECOMP to prevent decomposition
+    # has long-standing issues. _remove_invalid_ops_for_not_decompose was a band-aid
+    # to fix some of them, but more keep coming up over time, including a new issue
+    # with SDPA and contiguous views:
+    # https://fb.workplace.com/groups/pytorch.edge.users/permalink/1796069037930048/
+    # EDGE_DO_NOT_DECOMP is only needed by partitioners that specify check_op_support.
+    # As a temporary fix, we give a more reliable path to backends that do not
+    # specify check_op_support.
+    can_skip_using_EDGE_DO_NOT_DECOMP = True
+    for name, program in aten_programs.items():
+        if partitioner is not None:
+            for curr_partitioner in partitioner.get(name, []):
+                (
+                    curr_ops_no_decomp,
+                    check_op_support,
+                ) = curr_partitioner.ops_to_not_decompose(program)
+                if check_op_support is not None:
+                    can_skip_using_EDGE_DO_NOT_DECOMP = False
+    return can_skip_using_EDGE_DO_NOT_DECOMP
+
+
 def _gen_edge_manager_for_partitioners(
     partitioner: Dict[str, List[Partitioner]],
     aten_programs: Dict[str, ExportedProgram],

@@ -1095,37 +1117,56 @@ def _gen_edge_manager_for_partitioners(
     on nodes with preserved aten targets. They are then replaced with transformed ops to
     keep them through the second pass of decompositions
     """
+    can_skip_using_EDGE_DO_NOT_DECOMP = _can_skip_using_EDGE_DO_NOT_DECOMP(
+        partitioner, aten_programs
+    )
     ops_set_to_not_decompose_by_program = {}
     edge_programs: Dict[str, ExportedProgram] = {}
     for name, program in aten_programs.items():
+        # Functionalize program before asking partitioners to preserve ops
+        program = program.run_decompositions({})
+
         if partitioner is not None:
             # preserve all ops listed by all partitioners first
             all_ops_no_decomp = set()
+            all_ops_no_decomp_needing_preservation = []
            for curr_partitioner in partitioner.get(name, []):
                 curr_ops_no_decomp, _ = curr_partitioner.ops_to_not_decompose(program)
-                curr_ops_no_decomp = _remove_invalid_ops_for_not_decompose(
-                    curr_ops_no_decomp
-                )
                 all_ops_no_decomp |= set(curr_ops_no_decomp)

-            table = _default_decomposition_table()
+            # If not taking the can_skip_using_EDGE_DO_NOT_DECOMP path, we must
+            # remove invalid ops; otherwise there will be issues.
+            if not can_skip_using_EDGE_DO_NOT_DECOMP:
+                all_ops_no_decomp = _remove_invalid_ops_for_not_decompose(
+                    list(all_ops_no_decomp)
+                )
+                all_ops_no_decomp = set(all_ops_no_decomp)

+            # Run default decompositions, except for those in all_ops_no_decomp
+            table = _default_decomposition_table()
             for op in all_ops_no_decomp:
-                table.pop(op, None)
-
+                if table.pop(op, None) is not None:
+                    all_ops_no_decomp_needing_preservation.append(op)
             program = program.run_decompositions(table)
+
             # Among all the preserved aten ops, use the check_op_fn to do an additional
             # check on which ops need to be preserved and which ops need to be decomposed
             # Those which are truly preserved will be replaced with transformed ops
-            ops_set_to_not_decompose_by_program[name] = (
-                _replace_aten_ops_with_transformed_ops(name, program, partitioner) or []
-            )
-            program = program.run_decompositions(_default_decomposition_table())
+            if can_skip_using_EDGE_DO_NOT_DECOMP:
+                ops_set_to_not_decompose_by_program[name] = (
+                    all_ops_no_decomp_needing_preservation
+                )
+            else:
+                ops_set_to_not_decompose_by_program[name] = (
+                    _replace_aten_ops_with_transformed_ops(name, program, partitioner)
+                    or []
+                )

-        _restore_transformed_ops_to_aten_ops(program)
+            if not can_skip_using_EDGE_DO_NOT_DECOMP:
+                program = program.run_decompositions(_default_decomposition_table())
+                _restore_transformed_ops_to_aten_ops(program)

         edge_programs[name] = program
-
         edge_programs[name] = _generate_edge_program(
             config,
             program,

@@ -1169,7 +1210,7 @@ def collect_named_data_store_outputs(


 @et_logger("to_edge_transform_and_lower")
-def to_edge_transform_and_lower(
+def to_edge_transform_and_lower(  # noqa: C901
     programs: Union[ExportedProgram, Dict[str, ExportedProgram]],
     transform_passes: Optional[
         Union[Sequence[PassType], Dict[str, Sequence[PassType]]]

@@ -1234,6 +1275,9 @@ def to_edge_transform_and_lower(
     elif partitioner is None:
         partitioner = {name: [] for name in aten_programs.keys()}

+    can_skip_using_EDGE_DO_NOT_DECOMP = _can_skip_using_EDGE_DO_NOT_DECOMP(
+        partitioner, aten_programs
+    )
     edge_manager = _gen_edge_manager_for_partitioners(
         partitioner, aten_programs, config, constant_methods
     )

@@ -1259,7 +1303,8 @@ def to_edge_transform_and_lower(
             curr_op_set, check_op_support = curr_partitioner.ops_to_not_decompose(
                 program
             )
-            curr_op_set = _remove_invalid_ops_for_not_decompose(curr_op_set)
+            if not can_skip_using_EDGE_DO_NOT_DECOMP:
+                curr_op_set = _remove_invalid_ops_for_not_decompose(curr_op_set)
             ops_set_to_not_decompose = ops_set_to_not_decompose.union(curr_op_set)
             _sanity_check_graph_for_non_decomp_ops(
                 name,
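The whole fast path pivots on the second element of the ops_to_not_decompose contract: each partitioner returns (ops, check_op_fn), and only when every partitioner returns check_op_fn as None can the EDGE_DO_NOT_DECOMP round trip (replace aten ops with transformed ops, re-decompose, restore) be skipped in favor of simply popping the preserved ops from the decomposition table. A sketch of the two contract shapes as read from this diff; both partitioner classes are hypothetical, though the return signature matches the CoreML diff above:

from typing import Callable, List, Optional, Tuple

import torch
from torch.export import ExportedProgram


class UnconditionalPartitioner:
    """Hypothetical backend that preserves ops unconditionally (fast path)."""

    def ops_to_not_decompose(
        self, ep: ExportedProgram
    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
        # check_op_fn is None, so _can_skip_using_EDGE_DO_NOT_DECOMP stays True.
        return [torch.ops.aten.linear.default], None


class FilteringPartitioner:
    """Hypothetical backend whose per-node filter forces the legacy path."""

    def ops_to_not_decompose(
        self, ep: ExportedProgram
    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
        def check_op_fn(node: torch.fx.Node) -> bool:
            # Illustrative per-node criterion; needs the transformed-op round trip.
            return node.target == torch.ops.aten.linear.default

        # A non-None check function disables the skip for every program.
        return [torch.ops.aten.linear.default], check_op_fn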

exir/tests/control_flow_models.py

Lines changed: 3 additions & 9 deletions

@@ -20,9 +20,7 @@ def true_branch(x):
         def false_branch(x):
             return x * x

-        return torch.ops.higher_order.cond(
-            inp.sum() > 4, true_branch, false_branch, [inp]
-        )
+        return torch.cond(inp.sum() > 4, true_branch, false_branch, [inp])

     def get_random_inputs(self):
         return (torch.rand(5),)

@@ -39,9 +37,7 @@ def true_branch(x):
         def false_branch(x):
             return x * x * x

-        return torch.ops.higher_order.cond(
-            inp.sum() > 4, true_branch, false_branch, [inp]
-        )
+        return torch.cond(inp.sum() > 4, true_branch, false_branch, [inp])

     def get_upper_bound_inputs(self):
         return (torch.rand(8),)

@@ -72,9 +68,7 @@ def true_branch(x):
         def false_branch(x):
             return x * 2

-        return torch.ops.higher_order.cond(
-            inp.sum() > 4, true_branch, false_branch, [inp]
-        )
+        return torch.cond(inp.sum() > 4, true_branch, false_branch, [inp])

     def get_random_inputs(self):
         return (torch.eye(5) * 2,)

exir/tests/test_passes.py

Lines changed: 1 addition & 3 deletions

@@ -1463,9 +1463,7 @@ def forward(self, pred, x):
         out = torch.nn.functional.linear(
             x, self.w.to(torch.float16).to(torch.float32)
         )
-        return torch.ops.higher_order.cond(
-            pred, self.true_fn, self.false_fn, [out]
-        )
+        return torch.cond(pred, self.true_fn, self.false_fn, [out])

     mod = Module()
     x = torch.randn([3, 3])
