diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
index f54a95229c6..3e412b6dc56 100644
--- a/backends/cuda/TARGETS
+++ b/backends/cuda/TARGETS
@@ -2,6 +2,22 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 oncall("executorch")
 
+runtime.python_library(
+    name = "cuda_backend",
+    srcs = [
+        "cuda_backend.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir/_serialize:lib",
+        "//executorch/exir/backend:backend_details",
+        "//executorch/exir/backend:compile_spec_schema",
+    ],
+)
+
 runtime.python_library(
     name = "cuda_partitioner",
     srcs = [
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
new file mode 100644
index 00000000000..1942d5e24a3
--- /dev/null
+++ b/backends/cuda/cuda_backend.py
@@ -0,0 +1,171 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import os
+import typing
+
+from typing import Any, Dict, final, List, Optional, Set
+
+import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
+from executorch.exir._warnings import experimental
+from executorch.exir.backend.backend_details import (
+    BackendDetails,
+    ExportedProgram,
+    PreprocessResult,
+)
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
+from torch.export.passes import move_to_device_pass
+
+
+# Fallback operators that already exist in the ET namespace.
+supported_fallback_kernels: Dict[str, Any] = {}
+
+# Fallback kernels that are required but not yet supported.
+missing_fallback_kernels: Set[str] = set()
+
+
+# Context manager that guards against unsupported fallbacks: it collects any
+# fallback kernels generated during AOTI compile so an exception can be raised.
+@contextlib.contextmanager
+def collect_unsupported_fallback_kernels():
+    original_generate_c_shim_extern_kernel_call = (
+        CppWrapperCpu.generate_c_shim_extern_kernel_call
+    )
+    original_generate_fallback_kernel_with_runtime_lookup_aot = (
+        CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot
+    )
+
+    def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels(
+        self,
+        kernel: str,
+        args: list[str],
+        device: str,
+        *,
+        debug_args: Optional[list[str]] = None,
+    ):
+        if kernel not in supported_fallback_kernels:
+            missing_fallback_kernels.add(kernel)
+
+        original_generate_c_shim_extern_kernel_call(
+            self, kernel, args, device, debug_args=debug_args
+        )
+
+    def generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels(
+        self,
+        op_overload,
+        raw_args,
+        output_args,
+        raw_outputs,
+    ):
+        # Extract kernel name for collection
+        kernel_name = getattr(op_overload, "_name", str(op_overload))
+        if kernel_name not in supported_fallback_kernels:
+            missing_fallback_kernels.add(kernel_name)
+
+        original_generate_fallback_kernel_with_runtime_lookup_aot(
+            self, op_overload, raw_args, output_args, raw_outputs
+        )
+
+    CppWrapperCpu.generate_c_shim_extern_kernel_call = (
+        generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels
+    )
+    CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = (
+        generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels
+    )
+    try:
+        yield
+    finally:
+        CppWrapperCpu.generate_c_shim_extern_kernel_call = (
+            original_generate_c_shim_extern_kernel_call
+        )
+        CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = (
+            original_generate_fallback_kernel_with_runtime_lookup_aot
+        )
+
+
+@final
+@experimental(
+    "This API and all of cuda backend related functionality are experimental."
+)
+class CudaBackend(BackendDetails):
+    """
+    CudaBackend compiles a model to run on CUDA devices. It uses the AOTInductor compiler to generate
+    optimized, libtorch-free CUDA kernels for the model's operators. The compiled model can be executed
+    on CUDA devices using the ExecuTorch runtime.
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        # Move the edge_program from CPU to CUDA for AOTI compile
+        cuda_edge_program = move_to_device_pass(edge_program, "cuda")
+
+        edge_program_module = cuda_edge_program.module()
+
+        # Grab all input placeholders from the graph
+        user_input_names = cuda_edge_program.graph_signature.user_inputs
+        user_input_placeholders = []
+        for node in cuda_edge_program.graph.nodes:
+            if node.op == "placeholder" and node.name in user_input_names:
+                user_input_placeholders.append(node.meta["val"])
+
+        # Create pseudo user inputs using torch.randn and metadata from input placeholders
+        faked_user_inputs = []
+        for placeholder in user_input_placeholders:
+            if isinstance(placeholder, torch.Tensor):
+                # Generate a fake input with the same shape and dtype, on CUDA
+                fake_input = torch.randn(
+                    placeholder.shape, dtype=placeholder.dtype, device="cuda"
+                )
+                faked_user_inputs.append(fake_input)
+
+        faked_user_inputs = tuple(faked_user_inputs)
+
+        options: dict[str, typing.Any] = {
+            # Embed CUDA kernel binaries directly into the compiled shared object
+            "aot_inductor.embed_kernel_binary": True,
+            # Do not link against the full PyTorch/libtorch library
+            "aot_inductor.link_libtorch": False,
+            # Package model constants and other generated files directly in the shared object (.so) file
+            "aot_inductor.package_constants_in_so": True,
+            # Enable maximum automatic tuning for optimal performance
+            "max_autotune": True,
+            # Tune GEMM (general matrix multiply) operations with the TRITON backend only, to avoid libtorch operators
+            "max_autotune_gemm_backends": "TRITON",
+            # Tune convolution operations with the TRITON backend only, to avoid libtorch operators
+            "max_autotune_conv_backends": "TRITON",
+        }
+
+        with collect_unsupported_fallback_kernels():
+            so_path = torch._inductor.aot_compile(edge_program_module, faked_user_inputs, options=options)  # type: ignore[arg-type]
+            if len(missing_fallback_kernels) > 0:
+                formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
+                raise RuntimeError(
+                    f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
+                    "Please add them to the AOTI backend."
+                )
+
+        # pyre-ignorep[6]: Incompatible parameter type
+        with open(so_path, "rb") as f:
+            so_data = f.read()
+
+        named_data_store = NamedDataStore()
+        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
+        # pyre-ignorep[6]: Incompatible parameter type
+        os.remove(so_path)
+
+        return PreprocessResult(
+            processed_bytes=b"",
+            debug_handle_map={},
+            data_store_output=named_data_store.get_named_data_store_output(),
+        )
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index cf22b0dea81..d52d7d3d087 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -7,6 +7,8 @@
 from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend  # usort: skip
+from executorch.exir._warnings import experimental
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
     DelegationSpec,
@@ -18,6 +20,9 @@
 
 
 @final
+@experimental(
+    "This API and all of cuda backend related functionality are experimental."
+)
 class CudaPartitioner(Partitioner):
     """
     CUDA partitioner for AOTInductor backend integration.
@@ -31,7 +36,7 @@ class CudaPartitioner(Partitioner):
     """
 
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
-        self.delegation_spec = DelegationSpec("CudaBackend", compile_spec)
+        self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         """
diff --git a/backends/cuda/tests/TARGETS b/backends/cuda/tests/TARGETS
index c775cf2fec2..12718c04388 100644
--- a/backends/cuda/tests/TARGETS
+++ b/backends/cuda/tests/TARGETS
@@ -1,8 +1,28 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("@fbcode_macros//build_defs:python_unittest_remote_gpu.bzl", "python_unittest_remote_gpu")
 
 oncall("executorch")
 
+python_unittest_remote_gpu(
+    name = "test_cuda_export",
+    srcs = [
+        "test_cuda_export.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/cuda:cuda_backend",
+        "//executorch/backends/cuda:cuda_partitioner",
+        "//executorch/exir:lib",
+        "//executorch/exir/backend:backend_api",
+        "//executorch/exir/backend:compile_spec_schema",
+    ],
+    keep_gpu_sections = True,
+)
+
 python_unittest(
     name = "test_cuda_partitioner",
     srcs = [
@@ -14,6 +34,7 @@ python_unittest(
     deps = [
         "//caffe2:torch",
         "//executorch/backends/cuda:cuda_partitioner",
+        "//executorch/backends/cuda:cuda_backend",
         "//executorch/exir:lib",
         "//executorch/exir/backend:compile_spec_schema",
     ],
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
new file mode 100644
index 00000000000..99f8d33a766
--- /dev/null
+++ b/backends/cuda/tests/test_cuda_export.py
@@ -0,0 +1,250 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Tuple
+
+import torch
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge_transform_and_lower
+from torch.export import export
+
+
+class TestCudaExport(unittest.TestCase):
+    """Test CUDA export functionality for various operations using to_edge_transform_and_lower."""
+
+    def setUp(self):
+        """Set up test environment."""
+        # Skip tests if CUDA is not available
+        if not torch.cuda.is_available():
+            self.skipTest("CUDA is not available")
+
+    def _export_to_cuda_with_lower(
+        self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...]
+    ) -> EdgeProgramManager:
+        """Helper method to export a module to the CUDA backend using to_edge_transform_and_lower."""
+        # Export the model
+        exported_program = export(module, inputs, strict=True)
+
+        # Create the CUDA partitioner (no compile specs are required)
+        partitioner = CudaPartitioner([])
+
+        # Use to_edge_transform_and_lower for the complete pipeline
+        edge_program_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[partitioner],
+            compile_config=EdgeCompileConfig(
+                _check_ir_validity=False,
+            ),
+        )
+
+        # Verify that the pipeline succeeded
+        self.assertIsNotNone(edge_program_manager)
+        self.assertTrue(hasattr(edge_program_manager, "exported_program"))
+
+        # Verify that the final exported program contains delegated calls
+        exported_program = edge_program_manager.exported_program()
+        has_delegate_call = False
+        for node in exported_program.graph.nodes:
+            if node.op == "call_function" and "executorch_call_delegate" in str(
+                node.target
+            ):
+                has_delegate_call = True
+                break
+
+        self.assertTrue(
+            has_delegate_call, "No delegate calls found in final exported program"
+        )
+
+        return edge_program_manager
+
+    def test_simple_add(self):
+        """Test CUDA export for simple element-wise addition."""
+
+        class AddModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                return x + y
+
+        module = AddModule()
+        module.eval()
+        inputs = (torch.randn(3, 4), torch.randn(3, 4))
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Simple add operation export failed")
+
+    def test_conv2d(self):
+        """Test CUDA export for 2D convolution."""
+
+        class Conv2dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.conv(x)
+
+        module = Conv2dModule()
+        module.eval()
+        inputs = (torch.randn(1, 3, 32, 32),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Conv2d operation export failed")
+
+    def test_linear(self):
+        """Test CUDA export for linear layer."""
+
+        class LinearModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(128, 64)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.linear(x)
+
+        module = LinearModule()
+        module.eval()
+        inputs = (torch.randn(8, 128),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Linear operation export failed")
+
+    def test_resnet_block(self):
+        """Test CUDA export for a ResNet-style block."""
+
+        class ResNetBlock(torch.nn.Module):
+            def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
+                super().__init__()
+                self.conv1 = torch.nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=stride,
+                    padding=1,
+                    bias=False,
+                )
+                # Use eval mode to avoid batch norm mutations during export
+                self.bn1 = torch.nn.BatchNorm2d(out_channels)
+                self.relu = torch.nn.ReLU(inplace=True)
+                self.conv2 = torch.nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False,
+                )
+                self.bn2 = torch.nn.BatchNorm2d(out_channels)
+
+                # Shortcut connection
+                self.shortcut = torch.nn.Sequential()
+                if stride != 1 or in_channels != out_channels:
+                    self.shortcut = torch.nn.Sequential(
+                        torch.nn.Conv2d(
+                            in_channels,
+                            out_channels,
+                            kernel_size=1,
+                            stride=stride,
+                            bias=False,
+                        ),
+                        torch.nn.BatchNorm2d(out_channels),
+                    )
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                identity = self.shortcut(x)
+
+                out = self.conv1(x)
+                out = self.bn1(out)
+                out = self.relu(out)
+
+                out = self.conv2(out)
+                out = self.bn2(out)
+
+                out += identity
+                out = self.relu(out)
+
+                return out
+
+        module = ResNetBlock(64, 64)
+        # Set module to eval mode to avoid batch norm running statistics mutations
+        module.eval()
+        inputs = (torch.randn(1, 64, 32, 32),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "ResNet block export failed")
+
+    def test_multi_operation_module(self):
+        """Test CUDA export for a module with multiple operations."""
+
+        class MultiOpModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(3, 32, kernel_size=3, padding=1)
+                self.relu = torch.nn.ReLU()
+                self.pool = torch.nn.AdaptiveAvgPool2d((1, 1))
+                self.linear = torch.nn.Linear(32, 10)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                x = self.conv(x)
+                x = self.relu(x)
+                x = self.pool(x)
+                x = x.view(x.size(0), -1)
+                x = self.linear(x)
+                return x
+
+        module = MultiOpModule()
+        module.eval()
+        inputs = (torch.randn(2, 3, 16, 16),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(
+            edge_program_manager, "Multi-operation module export failed"
+        )
+
+    def test_activation_functions(self):
+        """Test CUDA export for various activation functions."""
+
+        class ActivationModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                # Test multiple activation functions
+                x1 = torch.relu(x)
+                x2 = torch.sigmoid(x)
+                x3 = torch.tanh(x)
+                return x1 + x2 + x3
+
+        module = ActivationModule()
+        module.eval()
+        inputs = (torch.randn(4, 8),)
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(edge_program_manager, "Activation functions export failed")
+
+    def test_mathematical_operations(self):
+        """Test CUDA export for mathematical operations."""
+
+        class MathOpsModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+                # Test various mathematical operations
+                add_result = x + y
+                mul_result = x * y
+                sub_result = x - y
+                div_result = x / (y + 1e-8)  # Add epsilon to avoid division by zero
+                return add_result + mul_result + sub_result + div_result
+
+        module = MathOpsModule()
+        module.eval()
+        inputs = (torch.randn(4, 4), torch.randn(4, 4))
+
+        # Test export
+        edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
+        self.assertIsNotNone(
+            edge_program_manager, "Mathematical operations export failed"
+        )
diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py
index 586d6f14494..cb4a2def1f8 100644
--- a/backends/cuda/tests/test_cuda_partitioner.py
+++ b/backends/cuda/tests/test_cuda_partitioner.py
@@ -9,7 +9,6 @@
 
 import torch
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
-from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import PartitionResult
 from torch.export import export
 
@@ -31,8 +30,7 @@ def _get_partition_result(
         exported_program = export(module, inputs, strict=True)
 
         # Create partitioner and compile specs
-        compile_specs = [CompileSpec("cuda_compile_options", b"")]
-        partitioner = CudaPartitioner(compile_specs)
+        partitioner = CudaPartitioner([])
 
         # Get partition result
         partition_result = partitioner.partition(exported_program)
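
Usage sketch (illustrative, not part of the diff): how a model could be lowered through the new CUDA partitioner and backend, mirroring the helper added in test_cuda_export.py. The TinyModel module and input shapes are placeholders; a CUDA-capable device and the dependencies declared in the TARGETS changes above are assumed.

    import torch
    from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
    from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
    from torch.export import export

    class TinyModel(torch.nn.Module):  # placeholder module, for illustration only
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return torch.relu(x) + 1.0

    # Export on CPU; CudaBackend.preprocess moves the program to CUDA itself.
    exported = export(TinyModel().eval(), (torch.randn(2, 8),), strict=True)
    edge = to_edge_transform_and_lower(
        exported,
        partitioner=[CudaPartitioner([])],
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    # Serializing packages the AOTI-compiled .so into the named data store.
    executorch_program = edge.to_executorch()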