diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
index f54a95229c6..3e412b6dc56 100644
--- a/backends/cuda/TARGETS
+++ b/backends/cuda/TARGETS
@@ -2,6 +2,22 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 oncall("executorch")
 
+runtime.python_library(
+    name = "cuda_backend",
+    srcs = [
+        "cuda_backend.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir/_serialize:lib",
+        "//executorch/exir/backend:backend_details",
+        "//executorch/exir/backend:compile_spec_schema",
+    ],
+)
+
 runtime.python_library(
     name = "cuda_partitioner",
     srcs = [
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
new file mode 100644
index 00000000000..1942d5e24a3
--- /dev/null
+++ b/backends/cuda/cuda_backend.py
@@ -0,0 +1,171 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import os
+import typing
+
+from typing import Any, Dict, final, List, Optional, Set
+
+import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
+from executorch.exir._warnings import experimental
+from executorch.exir.backend.backend_details import (
+    BackendDetails,
+    ExportedProgram,
+    PreprocessResult,
+)
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
+from torch.export.passes import move_to_device_pass
+
+
+# Fallback operators that already exist in the ET namespace.
+supported_fallback_kernels: Dict[str, Any] = {}
+
+# Fallback kernels that are required but not yet supported.
+missing_fallback_kernels: Set[str] = set()
+
+
+# Context manager that guards against unsupported fallbacks: it collects any
+# fallback kernels generated during AOTI compile so an exception can be raised.
+@contextlib.contextmanager
+def collect_unsupported_fallback_kernels():
+    original_generate_c_shim_extern_kernel_call = (
+        CppWrapperCpu.generate_c_shim_extern_kernel_call
+    )
+    original_generate_fallback_kernel_with_runtime_lookup_aot = (
+        CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot
+    )
+
+    def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels(
+        self,
+        kernel: str,
+        args: list[str],
+        device: str,
+        *,
+        debug_args: Optional[list[str]] = None,
+    ):
+        if kernel not in supported_fallback_kernels:
+            missing_fallback_kernels.add(kernel)
+
+        original_generate_c_shim_extern_kernel_call(
+            self, kernel, args, device, debug_args=debug_args
+        )
+
+    def generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels(
+        self,
+        op_overload,
+        raw_args,
+        output_args,
+        raw_outputs,
+    ):
+        # Extract kernel name for collection
+        kernel_name = getattr(op_overload, "_name", str(op_overload))
+        if kernel_name not in supported_fallback_kernels:
+            missing_fallback_kernels.add(kernel_name)
+
+        original_generate_fallback_kernel_with_runtime_lookup_aot(
+            self, op_overload, raw_args, output_args, raw_outputs
+        )
+
+    CppWrapperCpu.generate_c_shim_extern_kernel_call = (
+        generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels
+    )
+    CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = (
+        generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels
+    )
+    try:
+        yield
+    finally:
+        CppWrapperCpu.generate_c_shim_extern_kernel_call = (
+            original_generate_c_shim_extern_kernel_call
+        )
+        CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = (
+            original_generate_fallback_kernel_with_runtime_lookup_aot
+        )
+
+
+@final
+@experimental(
+    "This API and all of cuda backend related functionality are experimental."
+)
+class CudaBackend(BackendDetails):
+    """
+    CudaBackend compiles a model to run on CUDA devices. It uses the AOTInductor compiler to generate
+    optimized, libtorch-free CUDA kernels for the model's operators. The compiled model can be executed
+    on CUDA devices using the ExecuTorch runtime.
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        # Move the edge_program from CPU to CUDA for AOTI compile
+        cuda_edge_program = move_to_device_pass(edge_program, "cuda")
+
+        edge_program_module = cuda_edge_program.module()
+
+        # Grab all input placeholders from the graph
+        user_input_names = cuda_edge_program.graph_signature.user_inputs
+        user_input_placeholders = []
+        for node in cuda_edge_program.graph.nodes:
+            if node.op == "placeholder" and node.name in user_input_names:
+                user_input_placeholders.append(node.meta["val"])
+
+        # Create pseudo user inputs using torch.randn and metadata from input placeholders
+        faked_user_inputs = []
+        for placeholder in user_input_placeholders:
+            if isinstance(placeholder, torch.Tensor):
+                # Generate a fake input with the same shape and dtype, on CUDA
+                fake_input = torch.randn(
+                    placeholder.shape, dtype=placeholder.dtype, device="cuda"
+                )
+                faked_user_inputs.append(fake_input)
+
+        faked_user_inputs = tuple(faked_user_inputs)
+
+        options: dict[str, typing.Any] = {
+            # Embed CUDA kernel binaries directly into the compiled shared object
+            "aot_inductor.embed_kernel_binary": True,
+            # Do not link against the full PyTorch/libtorch library
+            "aot_inductor.link_libtorch": False,
+            # Package model constants and other generated files directly in the shared object (.so) file
+            "aot_inductor.package_constants_in_so": True,
+            # Enable maximum automatic tuning for optimal performance
+            "max_autotune": True,
+            # Tune GEMM (general matrix multiply) operations with the TRITON backend only, to avoid libtorch operators
+            "max_autotune_gemm_backends": "TRITON",
+            # Tune convolution operations with the TRITON backend only, to avoid libtorch operators
+            "max_autotune_conv_backends": "TRITON",
+        }
+
+        with collect_unsupported_fallback_kernels():
+            so_path = torch._inductor.aot_compile(edge_program_module, faked_user_inputs, options=options)  # type: ignore[arg-type]
+            if len(missing_fallback_kernels) > 0:
+                formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
+                raise RuntimeError(
+                    f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
+                    "Please add them to the AOTI backend."
+                )
+
+        # pyre-ignorep[6]: Incompatible parameter type
+        with open(so_path, "rb") as f:
+            so_data = f.read()
+
+        named_data_store = NamedDataStore()
+        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
+        # pyre-ignorep[6]: Incompatible parameter type
+        os.remove(so_path)
+
+        return PreprocessResult(
+            processed_bytes=b"",
+            debug_handle_map={},
+            data_store_output=named_data_store.get_named_data_store_output(),
+        )
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index cf22b0dea81..d52d7d3d087 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -7,6 +7,8 @@
 from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend  # usort: skip
+from executorch.exir._warnings import experimental
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
     DelegationSpec,
@@ -18,6 +20,9 @@
 
 
 @final
+@experimental(
+    "This API and all of cuda backend related functionality are experimental."
+)
 class CudaPartitioner(Partitioner):
     """
     CUDA partitioner for AOTInductor backend integration.
@@ -31,7 +36,7 @@ class CudaPartitioner(Partitioner):
     """
 
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
-        self.delegation_spec = DelegationSpec("CudaBackend", compile_spec)
+        self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         """
diff --git a/backends/cuda/tests/TARGETS b/backends/cuda/tests/TARGETS
index c775cf2fec2..12718c04388 100644
--- a/backends/cuda/tests/TARGETS
+++ b/backends/cuda/tests/TARGETS
@@ -1,8 +1,28 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("@fbcode_macros//build_defs:python_unittest_remote_gpu.bzl", "python_unittest_remote_gpu")
 
 oncall("executorch")
 
+python_unittest_remote_gpu(
+    name = "test_cuda_export",
+    srcs = [
+        "test_cuda_export.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/cuda:cuda_backend",
+        "//executorch/backends/cuda:cuda_partitioner",
+        "//executorch/exir:lib",
+        "//executorch/exir/backend:backend_api",
+        "//executorch/exir/backend:compile_spec_schema",
+    ],
+    keep_gpu_sections = True,
+)
+
 python_unittest(
     name = "test_cuda_partitioner",
     srcs = [
@@ -14,6 +34,7 @@ python_unittest(
     deps = [
         "//caffe2:torch",
         "//executorch/backends/cuda:cuda_partitioner",
+        "//executorch/backends/cuda:cuda_backend",
         "//executorch/exir:lib",
         "//executorch/exir/backend:compile_spec_schema",
     ],
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
new file mode 100644
index 00000000000..99f8d33a766
--- /dev/null
+++ b/backends/cuda/tests/test_cuda_export.py
@@ -0,0 +1,250 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Tuple
+
+import torch
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge_transform_and_lower
+from torch.export import export
+
+
+class TestCudaExport(unittest.TestCase):
+    """Test CUDA export functionality for various operations using to_edge_transform_and_lower."""
+
+    def setUp(self):
+        """Set up test environment."""
+        # Skip tests if CUDA is not available
+        if not torch.cuda.is_available():
+            self.skipTest("CUDA is not available")
+
+    def _export_to_cuda_with_lower(
+        self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...]
+    ) -> EdgeProgramManager:
+        """Helper method to export a module to the CUDA backend using to_edge_transform_and_lower."""
+        # Export the model
+        exported_program = export(module, inputs, strict=True)
+
+        # Create the CUDA partitioner (no compile specs are required)
+        partitioner = CudaPartitioner([])
+
+        # Use to_edge_transform_and_lower for the complete pipeline
+        edge_program_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[partitioner],
+            compile_config=EdgeCompileConfig(
+                _check_ir_validity=False,
+            ),
+        )
+
+        # Verify that the pipeline succeeded
+        self.assertIsNotNone(edge_program_manager)
+        self.assertTrue(hasattr(edge_program_manager, "exported_program"))
+
+        # Verify that the final exported program contains delegated calls
+        exported_program = edge_program_manager.exported_program()
+        has_delegate_call = False
+        for node in exported_program.graph.nodes:
+            if node.op == "call_function" and "executorch_call_delegate" in str(
+                node.target
+            ):
+                has_delegate_call = True
+                break
+
+        self.assertTrue(
+            has_delegate_call, "No delegate calls found in final exported program"
+        )
+
+        return edge_program_manager
+
+    def test_simple_add(self):
+        """Test CUDA export for simple element-wise addition."""
+
+        class AddModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return x + y
+
+        module = AddModule()
+        module.eval()
+        inputs = (torch.randn(3, 4), torch.randn(3, 4))
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Simple add operation export failed")
+
+    def test_conv2d(self):
+        """Test CUDA export for 2D convolution."""
+
+        class Conv2dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.conv(x)
+
+        module = Conv2dModule()
+        module.eval()
+        inputs = (torch.randn(1, 3, 32, 32),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Conv2d operation export failed")
+
+    def test_linear(self):
+        """Test CUDA export for linear layer."""
+
+        class LinearModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(128, 64)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.linear(x)
+
+        module = LinearModule()
+        module.eval()
+        inputs = (torch.randn(8, 128),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Linear operation export failed")
+
+    def test_resnet_block(self):
+        """Test CUDA export for a ResNet-style block."""
+
+        class ResNetBlock(torch.nn.Module):
+            def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
+                super().__init__()
+                self.conv1 = torch.nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=stride,
+                    padding=1,
+                    bias=False,
+                )
+                # Use eval mode to avoid batch norm mutations during export
+                self.bn1 = torch.nn.BatchNorm2d(out_channels)
+                self.relu = torch.nn.ReLU(inplace=True)
+                self.conv2 = torch.nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False,
+                )
+                self.bn2 = torch.nn.BatchNorm2d(out_channels)
+
+                # Shortcut connection
+                self.shortcut = torch.nn.Sequential()
+                if stride != 1 or in_channels != out_channels:
+                    self.shortcut = torch.nn.Sequential(
+                        torch.nn.Conv2d(
+                            in_channels,
+                            out_channels,
+                            kernel_size=1,
+                            stride=stride,
+                            bias=False,
+                        ),
+                        torch.nn.BatchNorm2d(out_channels),
+                    )
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                identity = self.shortcut(x)
+
+                out = self.conv1(x)
+                out = self.bn1(out)
+                out = self.relu(out)
+
+                out = self.conv2(out)
+                out = self.bn2(out)
+
+                out += identity
+                out = self.relu(out)
+
+                return out
+
+        module = ResNetBlock(64, 64)
+        # Set module to eval mode to avoid batch norm running statistics mutations
+        module.eval()
+        inputs = (torch.randn(1, 64, 32, 32),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "ResNet block export failed")
+
+    def test_multi_operation_module(self):
+        """Test CUDA export for a module with multiple operations."""
+
+        class MultiOpModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(3, 32, kernel_size=3, padding=1)
+                self.relu = torch.nn.ReLU()
+                self.pool = torch.nn.AdaptiveAvgPool2d((1, 1))
+                self.linear = torch.nn.Linear(32, 10)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                x = self.conv(x)
+                x = self.relu(x)
+                x = self.pool(x)
+                x = x.view(x.size(0), -1)
+                x = self.linear(x)
+                return x
+
+        module = MultiOpModule()
+        module.eval()
+        inputs = (torch.randn(2, 3, 16, 16),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(
+            edge_program_manager, "Multi-operation module export failed"
+        )
+
+    def test_activation_functions(self):
+        """Test CUDA export for various activation functions."""
+
+        class ActivationModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                # Test multiple activation functions
+                x1 = torch.relu(x)
+                x2 = torch.sigmoid(x)
+                x3 = torch.tanh(x)
+                return x1 + x2 + x3
+
+        module = ActivationModule()
+        module.eval()
+        inputs = (torch.randn(4, 8),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Activation functions export failed")
+
+    def test_mathematical_operations(self):
+        """Test CUDA export for mathematical operations."""
+
+        class MathOpsModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                # Test various mathematical operations
+                add_result = x + y
+                mul_result = x * y
+                sub_result = x - y
+                div_result = x / (y + 1e-8)  # Add epsilon to avoid division by zero
+                return add_result + mul_result + sub_result + div_result
+
+        module = MathOpsModule()
+        module.eval()
+        inputs = (torch.randn(4, 4), torch.randn(4, 4))
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(
+            edge_program_manager, "Mathematical operations export failed"
+        )
diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py
index 586d6f14494..cb4a2def1f8 100644
--- a/backends/cuda/tests/test_cuda_partitioner.py
+++ b/backends/cuda/tests/test_cuda_partitioner.py
@@ -9,7 +9,6 @@
 
 import torch
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
-from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import PartitionResult
 from torch.export import export
 
@@ -31,8 +30,7 @@ def _get_partition_result(
         exported_program = export(module, inputs, strict=True)
 
         # Create partitioner and compile specs
-        compile_specs = [CompileSpec("cuda_compile_options", b"")]
-        partitioner = CudaPartitioner(compile_specs)
+        partitioner = CudaPartitioner([])
 
         # Get partition result
         partition_result = partitioner.partition(exported_program)
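
Usage sketch (illustrative, not part of the diff): how a model could be lowered through the new CUDA partitioner and backend, mirroring the helper added in test_cuda_export.py. The TinyModel module and input shapes are placeholders; a CUDA-capable device and the dependencies declared in the TARGETS changes above are assumed.

    import torch
    from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
    from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
    from torch.export import export

    class TinyModel(torch.nn.Module):  # placeholder module, for illustration only
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return torch.relu(x) + 1.0

    # Export on CPU; CudaBackend.preprocess moves the program to CUDA itself.
    exported = export(TinyModel().eval(), (torch.randn(2, 8),), strict=True)
    edge = to_edge_transform_and_lower(
        exported,
        partitioner=[CudaPartitioner([])],
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    # Serializing packages the AOTI-compiled .so into the named data store.
    executorch_program = edge.to_executorch()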