pytorch
diff --git a/‎backends/aoti/aoti_delegate_handle.h‎
Lines changed: 12 additions & 0 deletions b/‎backends/aoti/aoti_delegate_handle.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎backends/aoti/common_shims.cpp‎
Lines changed: 0 additions & 268 deletions b/‎backends/aoti/common_shims.cpp‎
Lines changed: 0 additions & 268 deletions
diff --git a/‎backends/aoti/slim/c10/cuda/Exception.h‎
Lines changed: 2 additions & 0 deletions b/‎backends/aoti/slim/c10/cuda/Exception.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/aoti/targets.bzl‎
Lines changed: 6 additions & 28 deletions b/‎backends/aoti/targets.bzl‎
Lines changed: 6 additions & 28 deletions
diff --git a/‎backends/aoti/tests/TARGETS‎
Lines changed: 0 additions & 19 deletions b/‎backends/aoti/tests/TARGETS‎
Lines changed: 0 additions & 19 deletions
@@ -11,6 +11,11 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <string>
+#include <vector>
+
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/aoti/slim/core/slim_tensor.h>
+#endif
 
 namespace executorch {
 namespace backends {
@@ -95,6 +100,13 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
   AOTInductorModelContainerRunFunc run;
   AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
+
+#ifdef CUDA_AVAILABLE
+  // Cached output tensors for skip-copy optimization.
+  // When skip-copy is enabled, output SlimTensors are cached here to keep
+  // GPU memory alive while the caller processes the results.
+  std::vector<slim::SlimTensor> cached_outputs;
+#endif
 };
 
 } // namespace aoti
 
@@ -19,12 +19,14 @@
 
 /// Checks a CUDA expression and aborts on error.
 /// @param EXPR The CUDA expression to check.
+#ifndef ET_CUDA_CHECK
 #define ET_CUDA_CHECK(EXPR)                                                 \
   do {                                                                      \
     const cudaError_t __err = EXPR;                                         \
     ET_CHECK_MSG(                                                           \
         __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \
   } while (0)
+#endif
 
 /// Checks a CUDA expression and logs a warning on error (non-fatal).
 /// @param EXPR The CUDA expression to check.
 
@@ -33,26 +33,22 @@ def define_common_targets():
         ],
     )
 
-    # AOTI common shims functionality
+    # AOTI common shims functionality (header-only library)
+    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
+    # - With CUDA_AVAILABLE=1: Uses SlimTensor
+    # - Without CUDA_AVAILABLE: Uses ETensor
     runtime.cxx_library(
         name = "common_shims",
-        srcs = [
-            "common_shims.cpp",
-        ],
         headers = [
             "common_shims.h",
             "export.h",
             "utils.h",
         ],
-        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
-        link_whole = True,
-        supports_python_dlopen = True,
-        # Constructor needed for backend registration.
-        compiler_flags = ["-Wno-global-constructors"],
         visibility = ["PUBLIC"],
-        deps = [
+        exported_deps = [
             "//executorch/runtime/core:core",
             "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/backends/aoti/slim/core:slimtensor",
         ],
     )
 
@@ -86,21 +82,3 @@ def define_common_targets():
             ":delegate_handle",
         ],
     )
-
-    # SlimTensor-based common shims (header-only library)
-    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
-    # - With CUDA_AVAILABLE=1: Uses SlimTensor
-    # - Without CUDA_AVAILABLE: Uses ETensor
-    runtime.cxx_library(
-        name = "common_shims_slim",
-        headers = [
-            "common_shims_slim.h",
-            "export.h",
-        ],
-        visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
-            "//executorch/runtime/core:core",
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/backends/aoti/slim/core:slimtensor",
-        ],
-    )
@@ -8,27 +8,8 @@ cpp_unittest(
     srcs = [
         "test_common_shims.cpp",
     ],
-    headers = [
-        "utils.h",
-    ],
     deps = [
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/platform:platform",
-        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
-        "//executorch/runtime/core/exec_aten:lib",
-        "//executorch/extension/tensor:tensor",
-    ],
-)
-
-cpp_unittest(
-    name = "test_common_shims_slim",
-    srcs = [
-        "test_common_shims_slim.cpp",
-    ],
-    deps = [
-        "//executorch/backends/aoti:common_shims_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/runtime/core:core",