merge latest update

Gasoonjia · Gasoonjia · commit 1e90fd0a2545 · 2025-11-20T15:04:56.000-08:00
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
@@ -42,30 +42,38 @@ class AotiBackend(ABC):
     BackendDetails and AotiBackend to get the full functionality.
     """
 
-    @staticmethod
+    @classmethod
     @abstractmethod
-    def get_device_name() -> str:
+    def get_device_name(cls) -> str:
         """Return the device name for this backend (e.g., 'cuda', 'metal')."""
         pass
 
-    @staticmethod
+    @classmethod
     @abstractmethod
-    def get_supported_fallback_kernels() -> Dict[str, Any]:
+    def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         """Return the set of supported fallback kernels for this backend."""
         pass
 
-    @staticmethod
+    @classmethod
     @abstractmethod
-    def get_decomposition_table() -> Dict[Any, Any]:
+    def get_decomposition_table(cls) -> Dict[Any, Any]:
         """Return the decomposition table for this backend."""
         pass
 
-    @staticmethod
+    @classmethod
     @abstractmethod
-    def get_aoti_compile_options() -> Dict[str, typing.Any]:
+    def get_aoti_compile_options(
+        cls, compile_specs: List[CompileSpec]
+    ) -> Dict[str, typing.Any]:
         """Return the AOTInductor compilation options for this backend."""
         pass
 
+    @classmethod
+    @abstractmethod
+    def get_custom_passes(cls) -> List[typing.Any]:
+        """Return the list of custom passes to apply after ReplaceViewCopyWithViewPass and before decomposition."""
+        pass
+
     @classmethod
     @contextlib.contextmanager
     def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]):
@@ -145,7 +153,7 @@ def preprocess(
         """
         device_name = cls.get_device_name()
         decomposition_table = cls.get_decomposition_table()
-        options = cls.get_aoti_compile_options()
+        options = cls.get_aoti_compile_options(compile_specs)
 
         # Move the edge_program to the target device
         device_edge_program = move_to_device_pass(
@@ -155,6 +163,11 @@ def preprocess(
         # Replace view_copy with view
         ReplaceViewCopyWithViewPass()(device_edge_program.graph_module)
 
+        # Apply custom backend-specific passes
+        custom_passes = cls.get_custom_passes()
+        for custom_pass in custom_passes:
+            custom_pass(device_edge_program.graph_module)
+
         # Run decompositions if any
         if decomposition_table:
             device_edge_program = device_edge_program.run_decompositions(
@@ -236,8 +249,9 @@ def preprocess(
             data_store_output=named_data_store.get_named_data_store_output(),
         )
 
-    @staticmethod
+    @classmethod
     def generate_method_name_compile_spec(
+        cls,
         method_name: str,
     ) -> CompileSpec:
         """
@@ -248,8 +262,9 @@ def generate_method_name_compile_spec(
             method_name.encode("utf-8"),
         )
 
-    @staticmethod
+    @classmethod
     def method_name_from_compile_specs(
+        cls,
         compile_specs: List[CompileSpec],
     ) -> str:
         """
diff --git a/backends/apple/metal/metal_backend.py b/backends/apple/metal/metal_backend.py
@@ -5,11 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 import typing
-from typing import Any, Dict, final
+from typing import Any, Dict, final, List
 
 from executorch.backends.aoti.aoti_backend import AotiBackend
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import BackendDetails
+from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 
 @final
@@ -23,25 +24,34 @@ class MetalBackend(AotiBackend, BackendDetails):
     using the Executorch runtime.
     """
 
-    @staticmethod
-    def get_device_name() -> str:
+    @classmethod
+    def get_device_name(cls) -> str:
         return "metal"
 
-    @staticmethod
-    def get_supported_fallback_kernels() -> Dict[str, Any]:
+    @classmethod
+    def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         return {
             "aoti_torch_mps_addmm_out": None,
             "aoti_torch_mps_convolution": None,
             "aoti_torch_mps_mm_out": None,
             "at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
         }
 
-    @staticmethod
-    def get_decomposition_table() -> Dict[Any, Any]:
+    @classmethod
+    def get_decomposition_table(cls) -> Dict[Any, Any]:
         return {}
 
-    @staticmethod
-    def get_aoti_compile_options() -> Dict[str, typing.Any]:
+    @classmethod
+    def get_custom_passes(cls) -> List[typing.Any]:
+        """Return Metal-specific passes (currently none)"""
+        return []
+
+    @classmethod
+    def get_aoti_compile_options(
+        cls, compile_specs: List[CompileSpec]
+    ) -> Dict[str, typing.Any]:
+        """Get AOTI compile options for Metal backend."""
+        _ = compile_specs  # Unused, but required by interface
         return {
             # Do not link against the full PyTorch/libtorch library
             "aot_inductor.link_libtorch": False,
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
@@ -5,12 +5,17 @@
 # LICENSE file in the root directory of this source tree.
 
 import typing
-from typing import Any, Dict, final
+from importlib import resources
+from typing import Any, Dict, final, List
 
 import torch
 from executorch.backends.aoti.aoti_backend import AotiBackend
+from executorch.backends.cuda.triton.replacement_pass import (
+    ReplaceEdgeOpWithTritonOpPass,
+)
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import BackendDetails
+from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch._inductor.decomposition import conv1d_to_conv2d
 
 
@@ -25,25 +30,37 @@ class CudaBackend(AotiBackend, BackendDetails):
     using the Executorch runtime.
     """
 
-    @staticmethod
-    def get_device_name() -> str:
+    @classmethod
+    def get_device_name(cls) -> str:
         return "cuda"
 
-    @staticmethod
-    def get_supported_fallback_kernels() -> Dict[str, Any]:
+    @classmethod
+    def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         return {
             "at::_ops::_weight_int4pack_mm::call": None,
         }
 
-    @staticmethod
-    def get_decomposition_table() -> Dict[Any, Any]:
+    @classmethod
+    def get_decomposition_table(cls) -> Dict[Any, Any]:
         return {
             torch.ops.aten.conv1d.default: conv1d_to_conv2d,
         }
 
-    @staticmethod
-    def get_aoti_compile_options() -> Dict[str, typing.Any]:
-        return {
+    @classmethod
+    def get_custom_passes(cls) -> List[typing.Any]:
+        """Return CUDA-specific passes: ReplaceEdgeOpWithTritonOpPass"""
+        return [ReplaceEdgeOpWithTritonOpPass()]
+
+    @classmethod
+    def get_aoti_compile_options(
+        cls, compile_specs: List[CompileSpec]
+    ) -> Dict[str, typing.Any]:
+        """
+        Get AOTI compile options for CUDA backend.
+        Options may vary based on platform (Linux vs Windows).
+        """
+        # Base options for all platforms
+        options: Dict[str, typing.Any] = {
             # Disable this to support sdpa decomposition
             # TODO(gasoonjia): remove it after pin bump to latest pytorch
             "loop_ordering_after_fusion": False,
@@ -65,3 +82,35 @@ def get_aoti_compile_options() -> Dict[str, typing.Any]:
             # Use TRITON backend for convolution operations tuning only to avoid using operators in libtorch
             "max_autotune_conv_backends": "TRITON",
         }
+
+        # Parse compile_specs to check for platform
+        platform = "linux"
+        shim_library_path = None
+        for spec in compile_specs:
+            if spec.key == "platform":
+                platform = spec.value.decode("utf-8")
+            if spec.key == "shim_library_path":
+                shim_library_path = spec.value.decode("utf-8")
+
+        # Add platform-specific options
+        if platform == "windows":
+            # For Windows, get default shim library path if not provided
+            if shim_library_path is None:
+                lib_dir = resources.files("executorch").joinpath("data/lib")
+                shim_library_path = str(lib_dir)
+
+            options.update(
+                {
+                    "aot_inductor.cross_target_platform": "windows",
+                    "aot_inductor.aoti_shim_library": "aoti_cuda_shims",
+                    "aot_inductor.aoti_shim_library_path": shim_library_path,
+                    "aot_inductor.precompile_headers": False,
+                }
+            )
+        else:
+            # Linux platform
+            assert (
+                shim_library_path is None
+            ), "shim_library_path should not be set for Linux"
+
+        return options