6 changes: 6 additions & 0 deletions backends/aoti/aoti_model_container.cpp
@@ -25,6 +25,12 @@ AOTInductorModelContainerGetNumOutputsFunc
AOTInductorModelContainerGetNumOutputs = nullptr;
AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;

// Global function pointers needed by Metal backend
AOTInductorModelContainerGetInputNameFunc
AOTInductorModelContainerGetInputName = nullptr;
AOTInductorModelContainerGetNumConstantsFunc
AOTInductorModelContainerGetNumConstants = nullptr;

} // extern "C"

} // namespace aoti
16 changes: 16 additions & 0 deletions backends/aoti/aoti_model_container.h
@@ -70,6 +70,22 @@ extern AOTInductorModelContainerGetNumOutputsFunc
AOTInductorModelContainerGetNumOutputs;
extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;

// Function pointer types needed by Metal backend
using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
AOTInductorModelContainerHandle container_handle,
size_t input_idx,
const char** input_name);

using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
AOTInductorModelContainerHandle container_handle,
size_t* num_constants);

// Global function pointers needed by Metal backend
extern AOTInductorModelContainerGetInputNameFunc
AOTInductorModelContainerGetInputName;
extern AOTInductorModelContainerGetNumConstantsFunc
AOTInductorModelContainerGetNumConstants;

} // extern "C"

// AOTI Delegate Handle structure
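At runtime, the Metal backend resolves these symbols from the AOTI-compiled shared object (via dlsym in the C++ runtime). As a rough illustration of that lookup, here is a Python ctypes sketch; the "model.so" path and the container-handle plumbing are placeholders, not part of this PR:

import ctypes

# Load an AOTI-compiled shared object (path is a placeholder).
lib = ctypes.CDLL("model.so")

# Resolve the symbols declared above, mirroring what the backend does natively.
get_num_constants = lib.AOTInductorModelContainerGetNumConstants
get_num_constants.restype = ctypes.c_int32  # AOTIRuntimeError status code
get_num_constants.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t)]

get_input_name = lib.AOTInductorModelContainerGetInputName
get_input_name.restype = ctypes.c_int32
get_input_name.argtypes = [
    ctypes.c_void_p,                  # container_handle
    ctypes.c_size_t,                  # input_idx
    ctypes.POINTER(ctypes.c_char_p),  # input_name (out)
]

# With a real container handle (from AOTInductorModelContainerCreate, not shown):
# n = ctypes.c_size_t(0)
# err = get_num_constants(container_handle, ctypes.byref(n))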
5 changes: 5 additions & 0 deletions backends/aoti/common_shims.cpp
@@ -145,6 +145,11 @@ void cleanup_tensor_metadata() {
internal::tensor_to_strides.clear();
}

// Needed by Metal backend
size_t aoti_torch_dtype_element_size(int32_t dtype) {
return dtype_to_element_size(dtype);
}

} // extern "C"

} // namespace aoti
3 changes: 3 additions & 0 deletions backends/aoti/common_shims.h
@@ -68,6 +68,9 @@ void aoti_torch_grad_mode_set_enabled(bool enabled);
// Cleanup functions for clearing global state
void cleanup_tensor_metadata();

// Needed by Metal backend
size_t aoti_torch_dtype_element_size(int32_t dtype);

} // extern "C"

} // namespace aoti
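The shim just maps a ScalarType code to its byte width. For intuition, the same element sizes are observable from Python (illustration only; the integer dtype codes are defined by the shim layer):

import torch

# Element sizes corresponding to what aoti_torch_dtype_element_size reports.
for dtype in (torch.float32, torch.float16, torch.bfloat16, torch.int64):
    print(dtype, torch.tensor([], dtype=dtype).element_size())
# torch.float32 4, torch.float16 2, torch.bfloat16 2, torch.int64 8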
173 changes: 173 additions & 0 deletions backends/apple/metal/metal_backend.py
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import os
import typing
from enum import Enum

from typing import Any, Dict, final, List, Optional, Set

import torch
from executorch.backends.apple.metal.replace_slice_copy_with_slice import (
ReplaceSliceCopyWithSlicePass,
)
from executorch.exir._serialize._named_data_store import NamedDataStore
from executorch.exir._warnings import experimental
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
from torch.export.passes import move_to_device_pass


# Fallback operators that already exist in the ET namespace.
supported_fallback_kernels: Dict[str, Any] = {
"aoti_torch_mps_addmm_out": None,
"aoti_torch_mps_convolution": None,
"aoti_torch_mps_mm_out": None,
"at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
}

# Fallback kernels required by the model but not yet supported.
missing_fallback_kernels: Set[str] = set()


class COMPILE_SPEC_KEYS(Enum):
METHOD_NAME = "method_name"


# Context manager guaranteeing no unsupported fallbacks: it collects every
# fallback kernel generated during AOTI compile that we cannot support, so the
# caller can raise an exception afterwards if any were found.
@contextlib.contextmanager
def collect_unsupported_fallback_kernels():
original_generate_c_shim_extern_kernel_call = (
CppWrapperCpu.generate_c_shim_extern_kernel_call
)

def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels(
self,
kernel: str,
args: list[str],
device: str,
*,
debug_args: Optional[list[str]] = None,
debug_handle: Optional[int] = None,
):
if kernel not in supported_fallback_kernels:
missing_fallback_kernels.add(kernel)

original_generate_c_shim_extern_kernel_call(
self, kernel, args, device, debug_args=debug_args, debug_handle=debug_handle
)

CppWrapperCpu.generate_c_shim_extern_kernel_call = (
generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels
)
try:
yield
finally:
CppWrapperCpu.generate_c_shim_extern_kernel_call = (
original_generate_c_shim_extern_kernel_call
)


@final
@experimental(
"This API and all of Metal backend related functionality are experimental."
)
class MetalBackend(BackendDetails):
@staticmethod
def preprocess(
edge_program: ExportedProgram,
compile_specs: List[CompileSpec],
) -> PreprocessResult:
print("entering the lowerable parts in MetalBackend.preprocess....")
# Move the edge_program from CPU to MPS for aoti compile
mps_edge_program = move_to_device_pass(edge_program, "mps")

# replace slice_copy with slice
ReplaceSliceCopyWithSlicePass()(mps_edge_program.graph_module)

edge_program_module = mps_edge_program.module()

# Grab all input placeholders from the graph
user_input_names = mps_edge_program.graph_signature.user_inputs
user_input_placeholders = []
for node in mps_edge_program.graph.nodes:
if node.op == "placeholder" and node.name in user_input_names:
user_input_placeholders.append(node.meta["val"])

# Base options for all devices
options: dict[str, typing.Any] = {
# Do not link against the full PyTorch/libtorch library
"aot_inductor.link_libtorch": False,
# Package model constants and other generated files directly in the shared object (.so) file
"aot_inductor.package_constants_in_so": True,
# Enable maximum automatic tuning for optimal performance
"max_autotune": True,
# "aot_inductor.debug_compile": True,
# "aot_inductor.force_mmap_weights": False,
}

with collect_unsupported_fallback_kernels():
so_path = torch._inductor.aot_compile(  # type: ignore[arg-type]
    edge_program_module, tuple(user_input_placeholders), options=options
)
if len(missing_fallback_kernels) > 0:
formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
raise RuntimeError(
f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
"Please add them to the AOTI backend."
)

# pyre-ignore[6]: Incompatible parameter type
with open(so_path, "rb") as f:
so_data = f.read()

named_data_store = NamedDataStore()
method_name = MetalBackend.method_name_from_compile_specs(compile_specs)
named_data_store.add_named_data(
method_name + "_so_blob", so_data, 1, "aoti_metal_blob"
)

# Clean up the generated .so file; its contents have been packaged into the NamedDataStore.
# pyre-ignore[6]: Incompatible parameter type
os.remove(so_path)

return PreprocessResult(
processed_bytes=b"",
debug_handle_map={},
data_store_output=named_data_store.get_named_data_store_output(),
)

@staticmethod
def generate_method_name_compile_spec(
method_name: str,
) -> CompileSpec:
"""
Returns the compile spec that records the name of the method to be compiled
for this lowered module.
"""
return CompileSpec(
COMPILE_SPEC_KEYS.METHOD_NAME.value,
method_name.encode("utf-8"),
)

@staticmethod
def method_name_from_compile_specs(
compile_specs: List[CompileSpec],
) -> str:
"""
Returns the method name from the compile specs.
"""
for spec in compile_specs:
if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value:
return spec.value.decode("utf-8")
raise RuntimeError(
f"Could not find method name in compile specs: {compile_specs}"
)
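Taken together, the two helpers round-trip the method name through a CompileSpec; a quick sketch of the expected behavior:

from executorch.backends.apple.metal.metal_backend import MetalBackend

spec = MetalBackend.generate_method_name_compile_spec("forward")
assert spec.key == "method_name"
assert MetalBackend.method_name_from_compile_specs([spec]) == "forward"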
77 changes: 77 additions & 0 deletions backends/apple/metal/metal_partitioner.py
@@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable, Dict, final, List, Optional, Tuple

import torch
from executorch.backends.apple.metal.metal_backend import MetalBackend # usort: skip
from executorch.exir._warnings import experimental
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.backend.partitioner import (
DelegationSpec,
Partitioner,
PartitionResult,
)
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch.export.exported_program import ExportedProgram


@final
@experimental(
"This API and all of Metal backend related functionality are experimental."
)
class MetalPartitioner(Partitioner):
"""
Metal partitioner for AOTInductor backend integration.

This partitioner creates a single partition containing all operators from the input graph.
It skips core ATen decomposition, allowing the Metal backend to handle decomposition using
AOTInductor's MPS-specific decomposition table.

Only operators that cannot be handled by the aoti-mps library will be excluded from
the partition and fall back to ExecuTorch's default or custom handling.
"""

def __init__(self, compile_spec: List[CompileSpec]) -> None:
self.delegation_spec = DelegationSpec(MetalBackend.__name__, compile_spec)

def partition(self, exported_program: ExportedProgram) -> PartitionResult:
"""
Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
"""

partition_tags: Dict[str, DelegationSpec] = {}
tag = "tag0"

for node in exported_program.graph.nodes:
if node.op != "call_function":
continue
node.meta["delegation_tag"] = tag

partition_tags[tag] = self.delegation_spec

tag_constant_data(exported_program)
tag_mutated_buffer(exported_program)

return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
)

def ops_to_not_decompose(
self, ep: ExportedProgram
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
"""
Return the list of operations that should not be decomposed, leaving them to the AOT compiler.
Currently we skip ATen decomposition for all ops and let the Metal backend handle them.
"""
do_not_decompose = set()

for node in ep.graph.nodes:
if node.op == "call_function" and isinstance(
node.target, torch._ops.OpOverload
):
do_not_decompose.add(node.target)
return list(do_not_decompose), None
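For reference, a minimal end-to-end sketch of lowering a model through this partitioner, assuming the standard to_edge_transform_and_lower flow (the toy module and export call are illustrative, not part of this PR):

import torch
from executorch.backends.apple.metal.metal_backend import MetalBackend
from executorch.backends.apple.metal.metal_partitioner import MetalPartitioner
from executorch.exir import to_edge_transform_and_lower

class MulAdd(torch.nn.Module):
    def forward(self, x, y):
        return x * y + y

ep = torch.export.export(MulAdd().eval(), (torch.randn(4, 4), torch.randn(4, 4)))

specs = [MetalBackend.generate_method_name_compile_spec("forward")]
et_program = to_edge_transform_and_lower(
    ep, partitioner=[MetalPartitioner(specs)]
).to_executorch()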