diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index d5582dfe7c7..121f4b60418 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -25,34 +25,25 @@ endif()
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
-# Common AOTI functionality - combines all AOTI common components
-set(_aoti_common_sources common_shims.cpp)
-add_library(aoti_common STATIC ${_aoti_common_sources})
+# Common AOTI functionality - header-only library for common shims
+add_library(aoti_common INTERFACE)
 target_include_directories(
   aoti_common
-  PUBLIC $ $
-         $
+  INTERFACE $
+            $
+            $
 )
 target_compile_options(
   aoti_common
-  PUBLIC $<$:/EHsc /GR>
-         $<$>:-fexceptions -frtti -fPIC>
+  INTERFACE $<$:/EHsc /GR>
+            $<$>:-fexceptions -frtti -fPIC>
 )
 target_compile_definitions(
-  aoti_common PRIVATE $<$:EXPORT_AOTI_FUNCTIONS>
+  aoti_common INTERFACE $<$:EXPORT_AOTI_FUNCTIONS>
 )
-# Ensure symbols are exported properly
-if(APPLE)
-  target_link_options(aoti_common PUBLIC -Wl,-export_dynamic)
-else()
-  target_link_options(
-    aoti_common PUBLIC $<$>:-Wl,--export-dynamic>
-  )
-endif()
 
 # Link against ExecuTorch libraries and standard libraries
-target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
-executorch_target_link_options_shared_lib(aoti_common)
+target_link_libraries(aoti_common INTERFACE extension_tensor ${CMAKE_DL_LIBS})
 
 install(
   TARGETS aoti_common
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index b14e02da9ef..7447292e5d9 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -11,6 +11,11 @@
 #include
 #include
 #include
+#include
+
+#ifdef CUDA_AVAILABLE
+#include
+#endif
 
 namespace executorch {
 namespace backends {
@@ -95,6 +100,7 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
   AOTInductorModelContainerRunFunc run;
   AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
+
 };
 
 } // namespace aoti
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
deleted file mode 100644
index abfde86db6d..00000000000
--- a/backends/aoti/common_shims.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -namespace internal { -// Global storage for tensor metadata -AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_sizes; -AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_strides; -} // namespace internal - -extern "C" { - -// Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled() { - // No autograd ever - return false; -} - -void aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - throw std::runtime_error("Cannot enable autograd"); - } -} - -// Tensor attribute operations -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr) { - *ret_data_ptr = tensor->mutable_data_ptr(); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset) { - // Storage offset is always 0 in ET - *ret_storage_offset = 0; - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { - auto it = internal::tensor_to_strides.find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_strides.end()) { - needs_update = true; - } else { - // CRITICAL: Multimodal models reuse tensors with different shapes across - // executions (e.g., variable-length audio). We MUST validate cached - // metadata matches current tensor state, or CUDA kernels will receive - // incorrect shapes leading to memory corruption and segfaults. - auto tensor_strides = tensor->strides(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_strides.begin(), - tensor_strides.end()); - } - - if (needs_update) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = - internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors, but we need to - // return a valid pointer - if (it->second.empty()) { - static int64_t empty_strides_placeholder = 0; - *ret_strides = &empty_strides_placeholder; - } else { - *ret_strides = it->second.data(); - } - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { - *ret_dtype = static_cast(tensor->scalar_type()); - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { - auto it = internal::tensor_to_sizes.find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_sizes.end()) { - needs_update = true; - } else { - // CRITICAL: Multimodal models reuse tensors with different shapes across - // executions (e.g., variable-length audio). We MUST validate cached - // metadata matches current tensor state, or CUDA kernels will receive - // incorrect shapes leading to memory corruption and segfaults. 
- auto tensor_sizes = tensor->sizes(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_sizes.begin(), - tensor_sizes.end()); - } - - if (needs_update) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors, but we need to - // return a valid pointer - if (it->second.empty()) { - static int64_t empty_sizes_placeholder = 0; - *ret_sizes = &empty_sizes_placeholder; - } else { - *ret_sizes = it->second.data(); - } - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index) { - // Let's assume all tensors AOTI using are on CUDA:0 - *ret_device_index = 0; - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { - *ret_dim = static_cast(tensor->dim()); - return Error::Ok; -} - -// Device and layout utility functions -int32_t aoti_torch_device_type_cpu() { - // Let's say cpu is 0 for ET as well - return 0; -} - -int32_t aoti_torch_layout_strided() { - // ET only support strided layout, the return value will always be 0, a.k.a - // at::Layout::Strided; - return 0; -} - -// Dtype constants - these return the PyTorch dtype codes -int32_t aoti_torch_dtype_float32() { - return 6; // PyTorch's float32 dtype code -} - -int32_t aoti_torch_dtype_bfloat16() { - return 15; // PyTorch's bfloat16 dtype code -} - -int32_t aoti_torch_dtype_int8() { - return 1; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_int16() { - return 2; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_int32() { - return 3; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_bool() { - return 11; // PyTorch's bool dtype code -} - -int32_t aoti_torch_dtype_int64() { - return 4; // PyTorch's int64 dtype code -} - -// Dtype utility function needed by Metal backend. -// Returns the size of the dtype in bytes. 
-size_t aoti_torch_dtype_element_size(int32_t dtype) { - return dtype_to_element_size(dtype); -} - -// Cleanup functions -void cleanup_tensor_metadata() { - internal::tensor_to_sizes.clear(); - internal::tensor_to_strides.clear(); -} - -AOTI_SHIM_EXPORT void aoti_torch_warn( - const char* func, - const char* file, - uint32_t line, - const char* msg) { - ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { - (void)tensor; - (void)ret_size; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { - (void)self; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { - (void)self; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data_ptr, - int64_t ndim, - const int64_t* sizes, - const int64_t* strides, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor) { - (void)data_ptr; - (void)ndim; - (void)sizes; - (void)strides; - (void)storage_offset; - (void)dtype; - (void)device_type; - (void)device_index; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 3fc414fb669..dfcdecd2bc2 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -9,104 +9,343 @@ #pragma once #include -#include #include -#include #include #include #include +// Uses conditional compilation to separate the implementation between +// CUDA backend (SlimTensor) and other backends like MPS (ETensor). +// The caller determines which path is used by defining CUDA_AVAILABLE. 
+#ifdef CUDA_AVAILABLE +#include +#else +#include +#endif + namespace executorch { namespace backends { namespace aoti { // Common using declarations for ExecuTorch types using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; -extern "C" { +// ============================================================ +// Tensor Type Definition - branched based on CUDA_AVAILABLE +// ============================================================ +#ifdef CUDA_AVAILABLE +using Tensor = executorch::backends::aoti::slim::SlimTensor; +#else +using Tensor = executorch::runtime::etensor::Tensor; +#endif // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Attribute-related operations (memory-irrelevant) -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +#ifndef CUDA_AVAILABLE +namespace internal { +// Global storage for tensor metadata (ETensor path only) +// SlimTensor stores sizes/strides directly in int64_t[] - no caching needed +inline std::unordered_map>& tensor_to_sizes() { + static std::unordered_map> instance; + return instance; +} +inline std::unordered_map>& tensor_to_strides() { + static std::unordered_map> instance; + return instance; +} +} // namespace internal +#endif + +// ============================================================ +// Basic Property Getters - Inline implementations +// ============================================================ + +inline AOTITorchError aoti_torch_get_data_ptr( + Tensor* tensor, + void** ret_data_ptr) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_data_ptr == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + *ret_data_ptr = tensor->data_ptr(); +#else + *ret_data_ptr = tensor->mutable_data_ptr(); +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_sizes( + Tensor* tensor, + int64_t** ret_sizes) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_sizes == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor stores sizes directly in int64_t[] - no caching needed + *ret_sizes = const_cast(tensor->sizes().data()); +#else + auto it = internal::tensor_to_sizes().find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_sizes().end()) { + needs_update = true; + } else { + // Validate cached metadata matches current tensor state + auto tensor_sizes = tensor->sizes(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_sizes.begin(), + tensor_sizes.end()); + } + + if (needs_update) { + std::vector sizes(tensor->dim()); + auto tensor_sizes = tensor->sizes(); + for (int i = 0; i < tensor->dim(); i++) { + sizes[i] = tensor_sizes[i]; + } + it = internal::tensor_to_sizes() + .insert_or_assign(tensor, std::move(sizes)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors + if (it->second.empty()) { + static int64_t empty_sizes_placeholder = 0; + *ret_sizes = &empty_sizes_placeholder; + } else { + *ret_sizes = it->second.data(); + } +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_strides( + Tensor* tensor, + int64_t** ret_strides) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_strides == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor stores strides 
directly in int64_t[] - no caching needed + *ret_strides = const_cast(tensor->strides().data()); +#else + auto it = internal::tensor_to_strides().find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_strides().end()) { + needs_update = true; + } else { + // Validate cached metadata matches current tensor state + auto tensor_strides = tensor->strides(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_strides.begin(), + tensor_strides.end()); + } + + if (needs_update) { + std::vector strides(tensor->dim()); + auto tensor_strides = tensor->strides(); + for (int i = 0; i < tensor->dim(); i++) { + strides[i] = tensor_strides[i]; + } + it = internal::tensor_to_strides() + .insert_or_assign(tensor, std::move(strides)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors + if (it->second.empty()) { + static int64_t empty_strides_placeholder = 0; + *ret_strides = &empty_strides_placeholder; + } else { + *ret_strides = it->second.data(); + } +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_dtype == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + *ret_dtype = static_cast(tensor->dtype()); +#else + *ret_dtype = static_cast(tensor->scalar_type()); +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_dim == nullptr) { + return Error::InvalidArgument; + } + + *ret_dim = static_cast(tensor->dim()); + return Error::Ok; +} + +// ============================================================ +// Storage & Device Property Getters - Inline implementations +// ============================================================ + +inline AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_storage_offset == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor supports real storage offset + *ret_storage_offset = tensor->storage_offset(); +#else + // ETensor doesn't support storage_offset, return 0 + *ret_storage_offset = 0; +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_storage_size( + Tensor* tensor, + int64_t* ret_size) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_size == nullptr) { + return Error::InvalidArgument; + } + + *ret_size = static_cast(tensor->nbytes()); + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_device_type == nullptr) { + return Error::InvalidArgument; + } -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); +#ifdef CUDA_AVAILABLE + // SlimTensor supports real device type + *ret_device_type = static_cast(tensor->device_type()); +#else + // ETensor is always CPU in default mode + *ret_device_type = 0; // CPU +#endif + return Error::Ok; +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +inline AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_device_index == 
nullptr) { + return Error::InvalidArgument; + } -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +#ifdef CUDA_AVAILABLE + // SlimTensor supports real device index + *ret_device_index = static_cast(tensor->device_index()); +#else + // ETensor doesn't support multi-device, return 0 + *ret_device_index = 0; +#endif + return Error::Ok; +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +// ============================================================ +// DType Constants - These return PyTorch ScalarType enum values +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +inline int32_t aoti_torch_dtype_float32() { + return 6; // ScalarType::Float +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); +inline int32_t aoti_torch_dtype_bfloat16() { + return 15; // ScalarType::BFloat16 +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +inline int32_t aoti_torch_dtype_int64() { + return 4; // ScalarType::Long +} -// Utility functions for device and layout information -AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); -AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); +inline int32_t aoti_torch_dtype_int32() { + return 3; // ScalarType::Int +} -// Dtype utility function needed by Metal backend -AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); +inline int32_t aoti_torch_dtype_int16() { + return 2; // ScalarType::Short +} -// Autograd mode functions -AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); -AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); +inline int32_t aoti_torch_dtype_int8() { + return 1; // ScalarType::Char +} -// Cleanup functions for clearing global state -AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); +inline int32_t aoti_torch_dtype_bool() { + return 11; // ScalarType::Bool +} -AOTI_SHIM_EXPORT void aoti_torch_warn( - const char* func, - const char* file, - uint32_t line, - const char* msg); +// ============================================================ +// Device Type Constants +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +inline int32_t aoti_torch_device_type_cpu() { + return 0; // DeviceType::CPU +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); +inline int32_t aoti_torch_device_type_cuda() { + return 1; // DeviceType::CUDA +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); +// ============================================================ +// Grad Mode Functions (not supported in ExecuTorch) +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data_ptr, - int64_t ndim, - const int64_t* sizes, - const int64_t* strides, - int64_t storage_offset, - int32_t dtype, - int32_t 
device_type, - int32_t device_index, - Tensor** ret_new_tensor); +inline bool aoti_torch_grad_mode_is_enabled() { + return false; // ExecuTorch doesn't support autograd +} -} // extern "C" +inline AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) { + if (enabled) { + return Error::NotSupported; // Grad mode not supported in ExecuTorch + } + return Error::Ok; +} } // namespace aoti } // namespace backends diff --git a/backends/aoti/common_shims_slim.h b/backends/aoti/common_shims_slim.h deleted file mode 100644 index dfcdecd2bc2..00000000000 --- a/backends/aoti/common_shims_slim.h +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -// Uses conditional compilation to separate the implementation between -// CUDA backend (SlimTensor) and other backends like MPS (ETensor). -// The caller determines which path is used by defining CUDA_AVAILABLE. -#ifdef CUDA_AVAILABLE -#include -#else -#include -#endif - -namespace executorch { -namespace backends { -namespace aoti { - -// Common using declarations for ExecuTorch types -using executorch::runtime::Error; - -// ============================================================ -// Tensor Type Definition - branched based on CUDA_AVAILABLE -// ============================================================ -#ifdef CUDA_AVAILABLE -using Tensor = executorch::backends::aoti::slim::SlimTensor; -#else -using Tensor = executorch::runtime::etensor::Tensor; -#endif - -// Common AOTI type aliases -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -#ifndef CUDA_AVAILABLE -namespace internal { -// Global storage for tensor metadata (ETensor path only) -// SlimTensor stores sizes/strides directly in int64_t[] - no caching needed -inline std::unordered_map>& tensor_to_sizes() { - static std::unordered_map> instance; - return instance; -} -inline std::unordered_map>& tensor_to_strides() { - static std::unordered_map> instance; - return instance; -} -} // namespace internal -#endif - -// ============================================================ -// Basic Property Getters - Inline implementations -// ============================================================ - -inline AOTITorchError aoti_torch_get_data_ptr( - Tensor* tensor, - void** ret_data_ptr) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_data_ptr == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - *ret_data_ptr = tensor->data_ptr(); -#else - *ret_data_ptr = tensor->mutable_data_ptr(); -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_sizes( - Tensor* tensor, - int64_t** ret_sizes) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_sizes == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor stores sizes directly in int64_t[] - no caching needed - *ret_sizes = const_cast(tensor->sizes().data()); -#else - auto it = internal::tensor_to_sizes().find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_sizes().end()) { - needs_update = true; - } else { - // Validate cached metadata matches current tensor state - auto tensor_sizes = tensor->sizes(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_sizes.begin(), - tensor_sizes.end()); 
- } - - if (needs_update) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = internal::tensor_to_sizes() - .insert_or_assign(tensor, std::move(sizes)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors - if (it->second.empty()) { - static int64_t empty_sizes_placeholder = 0; - *ret_sizes = &empty_sizes_placeholder; - } else { - *ret_sizes = it->second.data(); - } -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_strides( - Tensor* tensor, - int64_t** ret_strides) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_strides == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor stores strides directly in int64_t[] - no caching needed - *ret_strides = const_cast(tensor->strides().data()); -#else - auto it = internal::tensor_to_strides().find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_strides().end()) { - needs_update = true; - } else { - // Validate cached metadata matches current tensor state - auto tensor_strides = tensor->strides(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_strides.begin(), - tensor_strides.end()); - } - - if (needs_update) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = internal::tensor_to_strides() - .insert_or_assign(tensor, std::move(strides)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors - if (it->second.empty()) { - static int64_t empty_strides_placeholder = 0; - *ret_strides = &empty_strides_placeholder; - } else { - *ret_strides = it->second.data(); - } -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_dtype == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - *ret_dtype = static_cast(tensor->dtype()); -#else - *ret_dtype = static_cast(tensor->scalar_type()); -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_dim == nullptr) { - return Error::InvalidArgument; - } - - *ret_dim = static_cast(tensor->dim()); - return Error::Ok; -} - -// ============================================================ -// Storage & Device Property Getters - Inline implementations -// ============================================================ - -inline AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_storage_offset == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real storage offset - *ret_storage_offset = tensor->storage_offset(); -#else - // ETensor doesn't support storage_offset, return 0 - *ret_storage_offset = 0; -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_storage_size( - Tensor* tensor, - int64_t* ret_size) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_size == nullptr) { - return Error::InvalidArgument; - } - - *ret_size = static_cast(tensor->nbytes()); - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_device_type( - 
Tensor* tensor, - int32_t* ret_device_type) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_device_type == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real device type - *ret_device_type = static_cast(tensor->device_type()); -#else - // ETensor is always CPU in default mode - *ret_device_type = 0; // CPU -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_device_index == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real device index - *ret_device_index = static_cast(tensor->device_index()); -#else - // ETensor doesn't support multi-device, return 0 - *ret_device_index = 0; -#endif - return Error::Ok; -} - -// ============================================================ -// DType Constants - These return PyTorch ScalarType enum values -// ============================================================ - -inline int32_t aoti_torch_dtype_float32() { - return 6; // ScalarType::Float -} - -inline int32_t aoti_torch_dtype_bfloat16() { - return 15; // ScalarType::BFloat16 -} - -inline int32_t aoti_torch_dtype_int64() { - return 4; // ScalarType::Long -} - -inline int32_t aoti_torch_dtype_int32() { - return 3; // ScalarType::Int -} - -inline int32_t aoti_torch_dtype_int16() { - return 2; // ScalarType::Short -} - -inline int32_t aoti_torch_dtype_int8() { - return 1; // ScalarType::Char -} - -inline int32_t aoti_torch_dtype_bool() { - return 11; // ScalarType::Bool -} - -// ============================================================ -// Device Type Constants -// ============================================================ - -inline int32_t aoti_torch_device_type_cpu() { - return 0; // DeviceType::CPU -} - -inline int32_t aoti_torch_device_type_cuda() { - return 1; // DeviceType::CUDA -} - -// ============================================================ -// Grad Mode Functions (not supported in ExecuTorch) -// ============================================================ - -inline bool aoti_torch_grad_mode_is_enabled() { - return false; // ExecuTorch doesn't support autograd -} - -inline AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - return Error::NotSupported; // Grad mode not supported in ExecuTorch - } - return Error::Ok; -} - -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/slim/c10/cuda/Exception.h b/backends/aoti/slim/c10/cuda/Exception.h index 33d8414e661..4db5781eb9f 100644 --- a/backends/aoti/slim/c10/cuda/Exception.h +++ b/backends/aoti/slim/c10/cuda/Exception.h @@ -19,12 +19,14 @@ /// Checks a CUDA expression and aborts on error. /// @param EXPR The CUDA expression to check. +#ifndef ET_CUDA_CHECK #define ET_CUDA_CHECK(EXPR) \ do { \ const cudaError_t __err = EXPR; \ ET_CHECK_MSG( \ __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \ } while (0) +#endif /// Checks a CUDA expression and logs a warning on error (non-fatal). /// @param EXPR The CUDA expression to check. 
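For reference (illustrative only, not part of this patch): the common_shims.h changes above make the shim layer header-only, with the tensor type selected entirely by whether CUDA_AVAILABLE is defined when the consumer is compiled. A minimal consumer sketch follows; the include path and the -DCUDA_AVAILABLE flag spelling are assumptions inferred from the targets in this diff, not verified build settings.

    // Illustrative sketch only. Assumes the header is reachable as
    // <executorch/backends/aoti/common_shims.h> and that CUDA_AVAILABLE is the
    // macro named in the diff; build with -DCUDA_AVAILABLE for the SlimTensor
    // path, without it for the ETensor path.
    #include <cstdint>
    #include <executorch/backends/aoti/common_shims.h>

    namespace aoti = executorch::backends::aoti;
    using executorch::runtime::Error;

    // Element count computed through the shim API; the same code compiles
    // against either tensor type because aoti::Tensor is resolved at compile
    // time by the CUDA_AVAILABLE branch in common_shims.h.
    inline int64_t numel_via_shims(aoti::Tensor* t) {
      int64_t dim = 0;
      int64_t* sizes = nullptr;
      if (aoti::aoti_torch_get_dim(t, &dim) != Error::Ok ||
          aoti::aoti_torch_get_sizes(t, &sizes) != Error::Ok) {
        return -1; // sentinel for this sketch; real code would propagate Error
      }
      int64_t n = 1;
      for (int64_t i = 0; i < dim; ++i) {
        n *= sizes[i];
      }
      return n;
    }

Because the branch is resolved at compile time and all shims are inline, consumers no longer need the export-dynamic link options or link_whole registration machinery removed elsewhere in this diff.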
diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl
index 588dbc14831..ffe27e1d1e3 100644
--- a/backends/aoti/targets.bzl
+++ b/backends/aoti/targets.bzl
@@ -33,26 +33,22 @@ def define_common_targets():
         ],
     )
 
-    # AOTI common shims functionality
+    # AOTI common shims functionality (header-only library)
+    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
+    # - With CUDA_AVAILABLE=1: Uses SlimTensor
+    # - Without CUDA_AVAILABLE: Uses ETensor
     runtime.cxx_library(
         name = "common_shims",
-        srcs = [
-            "common_shims.cpp",
-        ],
         headers = [
             "common_shims.h",
             "export.h",
             "utils.h",
         ],
-        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
-        link_whole = True,
-        supports_python_dlopen = True,
-        # Constructor needed for backend registration.
-        compiler_flags = ["-Wno-global-constructors"],
         visibility = ["PUBLIC"],
-        deps = [
+        exported_deps = [
            "//executorch/runtime/core:core",
            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/backends/aoti/slim/core:slimtensor",
         ],
     )
 
@@ -86,21 +82,3 @@ def define_common_targets():
             ":delegate_handle",
         ],
     )
-
-    # SlimTensor-based common shims (header-only library)
-    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
-    # - With CUDA_AVAILABLE=1: Uses SlimTensor
-    # - Without CUDA_AVAILABLE: Uses ETensor
-    runtime.cxx_library(
-        name = "common_shims_slim",
-        headers = [
-            "common_shims_slim.h",
-            "export.h",
-        ],
-        visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
-            "//executorch/runtime/core:core",
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/backends/aoti/slim/core:slimtensor",
-        ],
-    )
diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS
index d92e0e32a1f..8b3e8a7f4b1 100644
--- a/backends/aoti/tests/TARGETS
+++ b/backends/aoti/tests/TARGETS
@@ -8,27 +8,8 @@ cpp_unittest(
     srcs = [
         "test_common_shims.cpp",
     ],
-    headers = [
-        "utils.h",
-    ],
     deps = [
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/platform:platform",
-        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
-        "//executorch/runtime/core/exec_aten:lib",
-        "//executorch/extension/tensor:tensor",
-    ],
-)
-
-cpp_unittest(
-    name = "test_common_shims_slim",
-    srcs = [
-        "test_common_shims_slim.cpp",
-    ],
-    deps = [
-        "//executorch/backends/aoti:common_shims_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/runtime/core:core",
diff --git a/backends/aoti/tests/test_common_shims.cpp b/backends/aoti/tests/test_common_shims.cpp
index 0fd1b057f99..3bc76e522cf 100644
--- a/backends/aoti/tests/test_common_shims.cpp
+++ b/backends/aoti/tests/test_common_shims.cpp
@@ -6,330 +6,627 @@
  * LICENSE file in the root directory of this source tree.
*/ -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include + +#ifdef CUDA_AVAILABLE +#include +#endif + using namespace executorch::backends::aoti; -using namespace executorch::backends::aoti::test; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; + +namespace slim_c10 = executorch::backends::aoti::slim::c10; +namespace slim = executorch::backends::aoti::slim; + +namespace { + +#ifdef CUDA_AVAILABLE +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} +#endif + +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} + +} // namespace -// Test fixture for common shims tests -class CommonShimsTest : public ::testing::Test { +// Test fixture for common_shims_slim tests +class CommonShimsSlimTest : public ::testing::Test { protected: void SetUp() override { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); + et_pal_init(); } void TearDown() override { - // Clean up metadata and free any tensor data - cleanup_tensor_metadata(); - for (auto& tensor : test_tensors_) { - free_tensor_data(tensor.get()); + // Cleanup tracked tensors + for (Tensor* t : tensors_) { + delete t; } - test_tensors_.clear(); + tensors_.clear(); } - // Helper to create and track test tensors for cleanup - Tensor* create_tracked_tensor(const std::vector& sizes) { - auto tensor = create_test_tensor(sizes); - Tensor* ptr = tensor.get(); - test_tensors_.push_back(tensor); - return ptr; + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); + } + } + + Tensor* createTestTensor( + const std::vector& sizes, + slim_c10::DeviceType device_type) { + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); + return tensor; } private: - std::vector> test_tensors_; + std::vector tensors_; }; -// Test aoti_torch_get_sizes basic functionality -TEST_F(CommonShimsTest, GetSizesBasicFunctionality) { - // Test 1D tensor - auto tensor_1d = create_tracked_tensor({5}); - int64_t* sizes_ptr; - AOTITorchError error = aoti_torch_get_sizes(tensor_1d, &sizes_ptr); +// ============================================================================ +// Common test body implementations - parameterized by device type +// ============================================================================ + +void runGetDataPtrTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + void* data_ptr = nullptr; + AOTITorchError error = aoti_torch_get_data_ptr(tensor, &data_ptr); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 5); + 
EXPECT_NE(data_ptr, nullptr); - // Test 2D tensor - auto tensor_2d = create_tracked_tensor({3, 4}); - error = aoti_torch_get_sizes(tensor_2d, &sizes_ptr); + // Verify the returned pointer matches tensor's data_ptr + EXPECT_EQ(data_ptr, tensor->data_ptr()); + + delete tensor; +} + +void runGetSizesTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t* ret_sizes = nullptr; + AOTITorchError error = aoti_torch_get_sizes(tensor, &ret_sizes); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 3); - EXPECT_EQ(sizes_ptr[1], 4); + EXPECT_NE(ret_sizes, nullptr); - // Test 3D tensor - auto tensor_3d = create_tracked_tensor({2, 3, 4}); - error = aoti_torch_get_sizes(tensor_3d, &sizes_ptr); + // Verify sizes match + EXPECT_EQ(ret_sizes[0], 2); + EXPECT_EQ(ret_sizes[1], 3); + EXPECT_EQ(ret_sizes[2], 4); + + delete tensor; +} + +void runGetStridesTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t* ret_strides = nullptr; + AOTITorchError error = aoti_torch_get_strides(tensor, &ret_strides); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 3); - EXPECT_EQ(sizes_ptr[2], 4); + EXPECT_NE(ret_strides, nullptr); + + // Verify strides match: [12, 4, 1] for contiguous [2, 3, 4] + EXPECT_EQ(ret_strides[0], 12); + EXPECT_EQ(ret_strides[1], 4); + EXPECT_EQ(ret_strides[2], 1); + + delete tensor; +} + +void runGetDtypeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + // Test Float32 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Float)); + + delete tensor; + } + + // Test Int64 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Long, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Long)); + + delete tensor; + } + + // Test BFloat16 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::BFloat16, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::BFloat16)); + + delete tensor; + } } -// Test aoti_torch_get_strides basic functionality -TEST_F(CommonShimsTest, GetStridesBasicFunctionality) { +void runGetDimTest(slim_c10::DeviceType device_type) { + slim_c10::Device 
device(device_type, 0); + + // Test 0D tensor (scalar) + { + std::vector sizes = {}; + std::vector strides = {}; + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 0); + + delete tensor; + } + // Test 1D tensor - auto tensor_1d = create_tracked_tensor({5}); - int64_t* strides_ptr; - AOTITorchError error = aoti_torch_get_strides(tensor_1d, &strides_ptr); + { + std::vector sizes = {5}; + std::vector strides = calculateContiguousStrides(sizes); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 1); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); - // Test 2D tensor - row major: [3, 4] should have strides [4, 1] - auto tensor_2d = create_tracked_tensor({3, 4}); - error = aoti_torch_get_strides(tensor_2d, &strides_ptr); + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 4); - EXPECT_EQ(strides_ptr[1], 1); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 1); - // Test 3D tensor - row major: [2, 3, 4] should have strides [12, 4, 1] - auto tensor_3d = create_tracked_tensor({2, 3, 4}); - error = aoti_torch_get_strides(tensor_3d, &strides_ptr); + delete tensor; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 12); - EXPECT_EQ(strides_ptr[1], 4); - EXPECT_EQ(strides_ptr[2], 1); + // Test 3D tensor + { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 3); + + delete tensor; + } } -// Test caching logic for sizes -TEST_F(CommonShimsTest, SizesCachingLogic) { - auto tensor = create_tracked_tensor({2, 3, 4}); +// ============================================================================ +// Storage & Device Property Tests +// ============================================================================ + +void runGetStorageOffsetTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_storage_offset = -1; + AOTITorchError error = + aoti_torch_get_storage_offset(tensor, &ret_storage_offset); - // First call should cache the sizes - int64_t* sizes_ptr1; - AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr1); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr1, nullptr); + // Default storage offset for newly created tensor is 0 + EXPECT_EQ(ret_storage_offset, 0); + + delete tensor; +} + +void runGetStorageSizeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new 
Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_size = -1; + AOTITorchError error = aoti_torch_get_storage_size(tensor, &ret_size); - // Second call should return the same cached pointer - int64_t* sizes_ptr2; - error = aoti_torch_get_sizes(tensor, &sizes_ptr2); EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(sizes_ptr1, sizes_ptr2); // Should be the exact same pointer + // 2 * 3 * sizeof(float) = 6 * 4 = 24 bytes + EXPECT_EQ(ret_size, 24); - // Values should still be correct - EXPECT_EQ(sizes_ptr2[0], 2); - EXPECT_EQ(sizes_ptr2[1], 3); - EXPECT_EQ(sizes_ptr2[2], 4); + delete tensor; } -// Test caching logic for strides -TEST_F(CommonShimsTest, StridesCachingLogic) { - auto tensor = create_tracked_tensor({2, 3, 4}); +void runGetDeviceTypeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_device_type = -1; + AOTITorchError error = aoti_torch_get_device_type(tensor, &ret_device_type); - // First call should cache the strides - int64_t* strides_ptr1; - AOTITorchError error = aoti_torch_get_strides(tensor, &strides_ptr1); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr1, nullptr); + EXPECT_EQ(ret_device_type, static_cast(device_type)); + + delete tensor; +} + +void runGetDeviceIndexTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_device_index = -1; + AOTITorchError error = aoti_torch_get_device_index(tensor, &ret_device_index); - // Second call should return the same cached pointer - int64_t* strides_ptr2; - error = aoti_torch_get_strides(tensor, &strides_ptr2); EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(strides_ptr1, strides_ptr2); // Should be the exact same pointer + EXPECT_EQ(ret_device_index, 0); - // Values should still be correct - EXPECT_EQ(strides_ptr2[0], 12); - EXPECT_EQ(strides_ptr2[1], 4); - EXPECT_EQ(strides_ptr2[2], 1); + delete tensor; } -// Test that different tensors have different cached entries -TEST_F(CommonShimsTest, DifferentTensorsCacheSeparately) { - auto tensor1 = create_tracked_tensor({2, 3}); - auto tensor2 = create_tracked_tensor({4, 5}); +// ============================================================================ +// CPU Tests +// ============================================================================ - // Get sizes for both tensors - int64_t* sizes1_ptr; - int64_t* sizes2_ptr; +TEST_F(CommonShimsSlimTest, GetDataPtr_CPU) { + runGetDataPtrTest(slim_c10::DeviceType::CPU); +} - EXPECT_EQ(aoti_torch_get_sizes(tensor1, &sizes1_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor2, &sizes2_ptr), Error::Ok); +TEST_F(CommonShimsSlimTest, GetSizes_CPU) { + runGetSizesTest(slim_c10::DeviceType::CPU); +} - // Pointers should be different (different cache entries) - EXPECT_NE(sizes1_ptr, sizes2_ptr); +TEST_F(CommonShimsSlimTest, GetStrides_CPU) { + runGetStridesTest(slim_c10::DeviceType::CPU); +} - // Values should be correct - EXPECT_EQ(sizes1_ptr[0], 2); - EXPECT_EQ(sizes1_ptr[1], 3); - 
EXPECT_EQ(sizes2_ptr[0], 4); - EXPECT_EQ(sizes2_ptr[1], 5); +TEST_F(CommonShimsSlimTest, GetDtype_CPU) { + runGetDtypeTest(slim_c10::DeviceType::CPU); +} - // Test strides as well - int64_t* strides1_ptr; - int64_t* strides2_ptr; +TEST_F(CommonShimsSlimTest, GetDim_CPU) { + runGetDimTest(slim_c10::DeviceType::CPU); +} - EXPECT_EQ(aoti_torch_get_strides(tensor1, &strides1_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor2, &strides2_ptr), Error::Ok); +TEST_F(CommonShimsSlimTest, GetStorageOffset_CPU) { + runGetStorageOffsetTest(slim_c10::DeviceType::CPU); +} - // Pointers should be different (different cache entries) - EXPECT_NE(strides1_ptr, strides2_ptr); +TEST_F(CommonShimsSlimTest, GetStorageSize_CPU) { + runGetStorageSizeTest(slim_c10::DeviceType::CPU); +} - // Values should be correct - EXPECT_EQ(strides1_ptr[0], 3); - EXPECT_EQ(strides1_ptr[1], 1); - EXPECT_EQ(strides2_ptr[0], 5); - EXPECT_EQ(strides2_ptr[1], 1); +TEST_F(CommonShimsSlimTest, GetDeviceType_CPU) { + runGetDeviceTypeTest(slim_c10::DeviceType::CPU); } -// Test cache persistence across multiple calls -TEST_F(CommonShimsTest, CachePersistence) { - auto tensor = create_tracked_tensor({3, 4, 5}); +TEST_F(CommonShimsSlimTest, GetDeviceIndex_CPU) { + runGetDeviceIndexTest(slim_c10::DeviceType::CPU); +} - // Multiple calls to sizes should all return the same pointer - int64_t* sizes_ptr1; - int64_t* sizes_ptr2; - int64_t* sizes_ptr3; +// ============================================================================ +// CUDA Tests +// ============================================================================ - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr3), Error::Ok); +#ifdef CUDA_AVAILABLE +TEST_F(CommonShimsSlimTest, GetDataPtr_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDataPtrTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(sizes_ptr1, sizes_ptr2); - EXPECT_EQ(sizes_ptr2, sizes_ptr3); +TEST_F(CommonShimsSlimTest, GetSizes_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetSizesTest(slim_c10::DeviceType::CUDA); +} - // Multiple calls to strides should all return the same pointer - int64_t* strides_ptr1; - int64_t* strides_ptr2; - int64_t* strides_ptr3; +TEST_F(CommonShimsSlimTest, GetStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStridesTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr3), Error::Ok); +TEST_F(CommonShimsSlimTest, GetDtype_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDtypeTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(strides_ptr1, strides_ptr2); - EXPECT_EQ(strides_ptr2, strides_ptr3); +TEST_F(CommonShimsSlimTest, GetDim_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDimTest(slim_c10::DeviceType::CUDA); } -// Test 0D tensor (scalar) -TEST_F(CommonShimsTest, ScalarTensor) { - auto tensor_0d = create_tracked_tensor({}); +TEST_F(CommonShimsSlimTest, GetStorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStorageOffsetTest(slim_c10::DeviceType::CUDA); +} - // Test sizes for 0D tensor - int64_t* sizes_ptr; - AOTITorchError 
error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); +TEST_F(CommonShimsSlimTest, GetStorageSize_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStorageSizeTest(slim_c10::DeviceType::CUDA); +} - // Test strides for 0D tensor - int64_t* strides_ptr; - error = aoti_torch_get_strides(tensor_0d, &strides_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); +TEST_F(CommonShimsSlimTest, GetDeviceType_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDeviceTypeTest(slim_c10::DeviceType::CUDA); +} - // Cache should work for 0D tensors too - int64_t* sizes_ptr2; - error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr2); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(sizes_ptr, sizes_ptr2); +TEST_F(CommonShimsSlimTest, GetDeviceIndex_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDeviceIndexTest(slim_c10::DeviceType::CUDA); +} +#endif + +// ============================================================================ +// Error Cases +// ============================================================================ + +TEST_F(CommonShimsSlimTest, NullTensorArgument) { + void* data_ptr = nullptr; + int64_t* sizes = nullptr; + int64_t* strides = nullptr; + int32_t dtype = -1; + int64_t dim = -1; + + EXPECT_EQ( + aoti_torch_get_data_ptr(nullptr, &data_ptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_sizes(nullptr, &sizes), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_strides(nullptr, &strides), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dtype(nullptr, &dtype), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dim(nullptr, &dim), Error::InvalidArgument); } -// Test large tensor dimensions -TEST_F(CommonShimsTest, LargeTensorDimensions) { - auto tensor = create_tracked_tensor({100, 200, 300, 400}); +TEST_F(CommonShimsSlimTest, NullReturnPointer) { + Tensor* tensor = createTestTensor({2, 3}, slim_c10::DeviceType::CPU); - // Test sizes - int64_t* sizes_ptr; - AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 100); - EXPECT_EQ(sizes_ptr[1], 200); - EXPECT_EQ(sizes_ptr[2], 300); - EXPECT_EQ(sizes_ptr[3], 400); - - // Test strides - expected: [24000000, 120000, 400, 1] - int64_t* strides_ptr; - error = aoti_torch_get_strides(tensor, &strides_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 24000000); - EXPECT_EQ(strides_ptr[1], 120000); - EXPECT_EQ(strides_ptr[2], 400); - EXPECT_EQ(strides_ptr[3], 1); + EXPECT_EQ(aoti_torch_get_data_ptr(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_sizes(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_strides(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dtype(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dim(tensor, nullptr), Error::InvalidArgument); } -// Test that cleanup_tensor_metadata clears the cache -TEST_F(CommonShimsTest, CleanupFunctionality) { - auto tensor = create_tracked_tensor({2, 3}); +// ============================================================================ +// Edge Cases +// ============================================================================ - // Cache some data - int64_t* sizes_ptr1; - int64_t* strides_ptr1; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - 
EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); +TEST_F(CommonShimsSlimTest, ScalarTensor) { + std::vector sizes = {}; + std::vector strides = {}; + slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - // Clear the cache - cleanup_tensor_metadata(); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); - // Getting sizes/strides again should create new cache entries - // (We can't directly test if the pointers are different since that would be - // implementation-dependent, but we can at least verify the functions still - // work) - int64_t* sizes_ptr2; - int64_t* strides_ptr2; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + // Get sizes and strides for 0D tensor + int64_t* ret_sizes = nullptr; + int64_t* ret_strides = nullptr; + int64_t ret_dim = -1; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); + EXPECT_NE(ret_sizes, nullptr); + + EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); + EXPECT_NE(ret_strides, nullptr); - // Values should still be correct - EXPECT_EQ(sizes_ptr2[0], 2); - EXPECT_EQ(sizes_ptr2[1], 3); - EXPECT_EQ(strides_ptr2[0], 3); - EXPECT_EQ(strides_ptr2[1], 1); + EXPECT_EQ(aoti_torch_get_dim(tensor, &ret_dim), Error::Ok); + EXPECT_EQ(ret_dim, 0); } -// Test mixed operations to ensure caches are independent -TEST_F(CommonShimsTest, IndependentCaches) { - auto tensor = create_tracked_tensor({2, 3, 4}); +TEST_F(CommonShimsSlimTest, LargeTensor) { + std::vector sizes = {100, 200, 300}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(slim_c10::DeviceType::CPU, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); + + int64_t* ret_sizes = nullptr; + int64_t* ret_strides = nullptr; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); + EXPECT_EQ(ret_sizes[0], 100); + EXPECT_EQ(ret_sizes[1], 200); + EXPECT_EQ(ret_sizes[2], 300); + + EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); + EXPECT_EQ(ret_strides[0], 60000); // 200 * 300 + EXPECT_EQ(ret_strides[1], 300); // 300 + EXPECT_EQ(ret_strides[2], 1); +} - // Get sizes first - int64_t* sizes_ptr1; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); +TEST_F(CommonShimsSlimTest, ConsistentPointerReturn) { + Tensor* tensor = createTestTensor({2, 3, 4}, slim_c10::DeviceType::CPU); - // Get strides - int64_t* strides_ptr1; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + // Multiple calls should return the same pointer (for SlimTensor) + int64_t* sizes_ptr1 = nullptr; + int64_t* sizes_ptr2 = nullptr; - // Get sizes again - should be cached - int64_t* sizes_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); EXPECT_EQ(sizes_ptr1, sizes_ptr2); - // Get strides again - should be cached - int64_t* strides_ptr2; + int64_t* strides_ptr1 = nullptr; + int64_t* strides_ptr2 = nullptr; + + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); EXPECT_EQ(strides_ptr1, strides_ptr2); +} + +// ============================================================================ +// 
DType Constants Tests +// ============================================================================ + +TEST_F(CommonShimsSlimTest, DTypeConstants) { + // Verify dtype constants match expected PyTorch ScalarType values + EXPECT_EQ(aoti_torch_dtype_float32(), 6); // ScalarType::Float + EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // ScalarType::BFloat16 + EXPECT_EQ(aoti_torch_dtype_int64(), 4); // ScalarType::Long + EXPECT_EQ(aoti_torch_dtype_int32(), 3); // ScalarType::Int + EXPECT_EQ(aoti_torch_dtype_int16(), 2); // ScalarType::Short + EXPECT_EQ(aoti_torch_dtype_int8(), 1); // ScalarType::Char + EXPECT_EQ(aoti_torch_dtype_bool(), 11); // ScalarType::Bool +} + +// ============================================================================ +// Device Type Constants Tests +// ============================================================================ - // Sizes and strides pointers should be different (different caches) - EXPECT_NE(sizes_ptr1, strides_ptr1); +TEST_F(CommonShimsSlimTest, DeviceTypeConstants) { + EXPECT_EQ(aoti_torch_device_type_cpu(), 0); // DeviceType::CPU + EXPECT_EQ(aoti_torch_device_type_cuda(), 1); // DeviceType::CUDA } -// Test all dtype functions return correct PyTorch dtype codes -TEST_F(CommonShimsTest, AllDtypesReturnCorrectValues) { - EXPECT_EQ(aoti_torch_dtype_float32(), 6); // PyTorch's float32 dtype code - EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // PyTorch's bfloat16 dtype code - EXPECT_EQ(aoti_torch_dtype_int8(), 1); // PyTorch's int8 dtype code - EXPECT_EQ(aoti_torch_dtype_int16(), 2); // PyTorch's int16 dtype code - EXPECT_EQ(aoti_torch_dtype_int32(), 3); // PyTorch's int32 dtype code - EXPECT_EQ(aoti_torch_dtype_int64(), 4); // PyTorch's int64 dtype code - EXPECT_EQ(aoti_torch_dtype_bool(), 11); // PyTorch's bool dtype code +// ============================================================================ +// Grad Mode Tests +// ============================================================================ + +TEST_F(CommonShimsSlimTest, GradModeIsEnabled) { + // ExecuTorch doesn't support autograd, so should always return false + EXPECT_EQ(aoti_torch_grad_mode_is_enabled(), false); +} + +TEST_F(CommonShimsSlimTest, GradModeSetEnabled) { + // Setting to false should succeed + EXPECT_EQ(aoti_torch_grad_mode_set_enabled(false), Error::Ok); + + // Setting to true should fail (not supported in ExecuTorch) + EXPECT_EQ(aoti_torch_grad_mode_set_enabled(true), Error::NotSupported); } diff --git a/backends/aoti/tests/test_common_shims_slim.cpp b/backends/aoti/tests/test_common_shims_slim.cpp deleted file mode 100644 index 94319c6f94d..00000000000 --- a/backends/aoti/tests/test_common_shims_slim.cpp +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_AVAILABLE -#include -#endif - -using namespace executorch::backends::aoti; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; -namespace slim = executorch::backends::aoti::slim; - -namespace { - -#ifdef CUDA_AVAILABLE -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} -#endif - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -// Test fixture for common_shims_slim tests -class CommonShimsSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Cleanup tracked tensors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - } - - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - Tensor* createTestTensor( - const std::vector& sizes, - slim_c10::DeviceType device_type) { - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - return tensor; - } - - private: - std::vector tensors_; -}; - -// ============================================================================ -// Common test body implementations - parameterized by device type -// ============================================================================ - -void runGetDataPtrTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - void* data_ptr = nullptr; - AOTITorchError error = aoti_torch_get_data_ptr(tensor, &data_ptr); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(data_ptr, nullptr); - - // Verify the returned pointer matches tensor's data_ptr - EXPECT_EQ(data_ptr, tensor->data_ptr()); - - delete tensor; -} - -void runGetSizesTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t* ret_sizes = nullptr; - AOTITorchError error = aoti_torch_get_sizes(tensor, &ret_sizes); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(ret_sizes, nullptr); - - // Verify sizes match - EXPECT_EQ(ret_sizes[0], 2); - EXPECT_EQ(ret_sizes[1], 3); - EXPECT_EQ(ret_sizes[2], 4); - - delete tensor; -} - -void runGetStridesTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - 
slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t* ret_strides = nullptr; - AOTITorchError error = aoti_torch_get_strides(tensor, &ret_strides); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(ret_strides, nullptr); - - // Verify strides match: [12, 4, 1] for contiguous [2, 3, 4] - EXPECT_EQ(ret_strides[0], 12); - EXPECT_EQ(ret_strides[1], 4); - EXPECT_EQ(ret_strides[2], 1); - - delete tensor; -} - -void runGetDtypeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - // Test Float32 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Float)); - - delete tensor; - } - - // Test Int64 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Long, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Long)); - - delete tensor; - } - - // Test BFloat16 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::BFloat16, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::BFloat16)); - - delete tensor; - } -} - -void runGetDimTest(slim_c10::DeviceType device_type) { - slim_c10::Device device(device_type, 0); - - // Test 0D tensor (scalar) - { - std::vector sizes = {}; - std::vector strides = {}; - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 0); - - delete tensor; - } - - // Test 1D tensor - { - std::vector sizes = {5}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 1); - - delete tensor; - } - - // Test 3D tensor - { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 3); - - delete tensor; - } -} - -// ============================================================================ -// Storage & Device Property Tests -// ============================================================================ - -void runGetStorageOffsetTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - 
std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_storage_offset = -1; - AOTITorchError error = - aoti_torch_get_storage_offset(tensor, &ret_storage_offset); - - EXPECT_EQ(error, Error::Ok); - // Default storage offset for newly created tensor is 0 - EXPECT_EQ(ret_storage_offset, 0); - - delete tensor; -} - -void runGetStorageSizeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_size = -1; - AOTITorchError error = aoti_torch_get_storage_size(tensor, &ret_size); - - EXPECT_EQ(error, Error::Ok); - // 2 * 3 * sizeof(float) = 6 * 4 = 24 bytes - EXPECT_EQ(ret_size, 24); - - delete tensor; -} - -void runGetDeviceTypeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_device_type = -1; - AOTITorchError error = aoti_torch_get_device_type(tensor, &ret_device_type); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_device_type, static_cast(device_type)); - - delete tensor; -} - -void runGetDeviceIndexTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_device_index = -1; - AOTITorchError error = aoti_torch_get_device_index(tensor, &ret_device_index); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_device_index, 0); - - delete tensor; -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, GetDataPtr_CPU) { - runGetDataPtrTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetSizes_CPU) { - runGetSizesTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStrides_CPU) { - runGetStridesTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDtype_CPU) { - runGetDtypeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDim_CPU) { - runGetDimTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStorageOffset_CPU) { - runGetStorageOffsetTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStorageSize_CPU) { - runGetStorageSizeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDeviceType_CPU) { - runGetDeviceTypeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDeviceIndex_CPU) { - runGetDeviceIndexTest(slim_c10::DeviceType::CPU); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -#ifdef CUDA_AVAILABLE 
-TEST_F(CommonShimsSlimTest, GetDataPtr_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDataPtrTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetSizes_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetSizesTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStrides_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStridesTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDtype_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDtypeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDim_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDimTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStorageOffset_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStorageOffsetTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStorageSize_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStorageSizeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDeviceType_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDeviceTypeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDeviceIndex_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDeviceIndexTest(slim_c10::DeviceType::CUDA); -} -#endif - -// ============================================================================ -// Error Cases -// ============================================================================ - -TEST_F(CommonShimsSlimTest, NullTensorArgument) { - void* data_ptr = nullptr; - int64_t* sizes = nullptr; - int64_t* strides = nullptr; - int32_t dtype = -1; - int64_t dim = -1; - - EXPECT_EQ( - aoti_torch_get_data_ptr(nullptr, &data_ptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_sizes(nullptr, &sizes), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_strides(nullptr, &strides), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dtype(nullptr, &dtype), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dim(nullptr, &dim), Error::InvalidArgument); -} - -TEST_F(CommonShimsSlimTest, NullReturnPointer) { - Tensor* tensor = createTestTensor({2, 3}, slim_c10::DeviceType::CPU); - - EXPECT_EQ(aoti_torch_get_data_ptr(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_sizes(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_strides(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dtype(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dim(tensor, nullptr), Error::InvalidArgument); -} - -// ============================================================================ -// Edge Cases -// ============================================================================ - -TEST_F(CommonShimsSlimTest, ScalarTensor) { - std::vector sizes = {}; - std::vector strides = {}; - slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - - // Get sizes and strides for 0D tensor - int64_t* ret_sizes = nullptr; - int64_t* ret_strides = nullptr; - int64_t ret_dim = -1; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, 
&ret_sizes), Error::Ok); - EXPECT_NE(ret_sizes, nullptr); - - EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); - EXPECT_NE(ret_strides, nullptr); - - EXPECT_EQ(aoti_torch_get_dim(tensor, &ret_dim), Error::Ok); - EXPECT_EQ(ret_dim, 0); -} - -TEST_F(CommonShimsSlimTest, LargeTensor) { - std::vector sizes = {100, 200, 300}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - - int64_t* ret_sizes = nullptr; - int64_t* ret_strides = nullptr; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); - EXPECT_EQ(ret_sizes[0], 100); - EXPECT_EQ(ret_sizes[1], 200); - EXPECT_EQ(ret_sizes[2], 300); - - EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); - EXPECT_EQ(ret_strides[0], 60000); // 200 * 300 - EXPECT_EQ(ret_strides[1], 300); // 300 - EXPECT_EQ(ret_strides[2], 1); -} - -TEST_F(CommonShimsSlimTest, ConsistentPointerReturn) { - Tensor* tensor = createTestTensor({2, 3, 4}, slim_c10::DeviceType::CPU); - - // Multiple calls should return the same pointer (for SlimTensor) - int64_t* sizes_ptr1 = nullptr; - int64_t* sizes_ptr2 = nullptr; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(sizes_ptr1, sizes_ptr2); - - int64_t* strides_ptr1 = nullptr; - int64_t* strides_ptr2 = nullptr; - - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); - EXPECT_EQ(strides_ptr1, strides_ptr2); -} - -// ============================================================================ -// DType Constants Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, DTypeConstants) { - // Verify dtype constants match expected PyTorch ScalarType values - EXPECT_EQ(aoti_torch_dtype_float32(), 6); // ScalarType::Float - EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // ScalarType::BFloat16 - EXPECT_EQ(aoti_torch_dtype_int64(), 4); // ScalarType::Long - EXPECT_EQ(aoti_torch_dtype_int32(), 3); // ScalarType::Int - EXPECT_EQ(aoti_torch_dtype_int16(), 2); // ScalarType::Short - EXPECT_EQ(aoti_torch_dtype_int8(), 1); // ScalarType::Char - EXPECT_EQ(aoti_torch_dtype_bool(), 11); // ScalarType::Bool -} - -// ============================================================================ -// Device Type Constants Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, DeviceTypeConstants) { - EXPECT_EQ(aoti_torch_device_type_cpu(), 0); // DeviceType::CPU - EXPECT_EQ(aoti_torch_device_type_cuda(), 1); // DeviceType::CUDA -} - -// ============================================================================ -// Grad Mode Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, GradModeIsEnabled) { - // ExecuTorch doesn't support autograd, so should always return false - EXPECT_EQ(aoti_torch_grad_mode_is_enabled(), false); -} - -TEST_F(CommonShimsSlimTest, GradModeSetEnabled) { - // Setting to false should succeed - EXPECT_EQ(aoti_torch_grad_mode_set_enabled(false), Error::Ok); - - // Setting to true should fail (not supported in ExecuTorch) - EXPECT_EQ(aoti_torch_grad_mode_set_enabled(true), 
Error::NotSupported); -} diff --git a/backends/aoti/tests/utils.h b/backends/aoti/tests/utils.h deleted file mode 100644 index 1f26f7e2d51..00000000000 --- a/backends/aoti/tests/utils.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { -namespace test { - -// Use the same type aliases as in common_shims.h -using executorch::runtime::etensor::Tensor; - -/** - * Creates a test tensor with the specified shape and scalar type - */ -inline std::shared_ptr create_test_tensor( - const std::vector& sizes, - exec_aten::ScalarType dtype = exec_aten::ScalarType::Float) { - // Calculate total number of elements - int64_t total_elements = 1; - for (int64_t size : sizes) { - total_elements *= size; - } - - // Calculate strides (row-major layout) - std::vector strides(sizes.size()); - if (sizes.size() > 0) { - strides[sizes.size() - 1] = 1; - for (int i = sizes.size() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - } - - // Allocate data buffer - size_t dtype_size = exec_aten::elementSize(dtype); - void* data = malloc(total_elements * dtype_size); - - // Convert sizes and strides to the required type - std::vector sizes_converted( - sizes.begin(), sizes.end()); - std::vector strides_converted( - strides.begin(), strides.end()); - - // Create the tensor with the correct argument types and count - auto tensor = executorch::extension::from_blob( - data, sizes_converted, strides_converted, dtype); - - return tensor; -} - -/** - * Helper to clean up tensor data that was allocated with malloc - */ -inline void free_tensor_data(Tensor* tensor) { - if (tensor && tensor->mutable_data_ptr()) { - free(tensor->mutable_data_ptr()); - } -} - -} // namespace test -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c85e07d4b59..1cb8bf78c4c 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -98,6 +98,7 @@ install( ) # CUDA-specific AOTI shim symbols (dynamically linked) +# Note: common_shims.h is header-only (all functions are inline) set(_aoti_cuda_shim_sources runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu @@ -106,6 +107,9 @@ set(_aoti_cuda_shim_sources add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) +# Define CUDA_AVAILABLE to use SlimTensor in common_shims.h +target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1) + # Define export macros for shared library if(MSVC) target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index ad5baa8d83f..0d2e14248df 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -71,34 +71,34 @@ runtime.cxx_library( runtime.cxx_library( name = "runtime_shims", srcs = [ - "guard.cpp", "shims/cuda_guard.cpp", "shims/int4mm.cu", "shims/memory.cpp", "shims/tensor_attribute.cpp", ], headers = [ - "guard.h", "shims/cuda_guard.h", "shims/int4mm.cuh", "shims/int4mm.h", "shims/memory.h", "shims/tensor_attribute.h", - "utils.h", ], # @lint-ignore BUCKLINT: Avoid `link_whole=True` 
(https://fburl.com/avoid-link-whole) link_whole = True, supports_python_dlopen = True, # Constructor needed for backend registration. compiler_flags = ["-Wno-global-constructors"], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], visibility = ["PUBLIC"], deps = [ ":tensor_maker", "//executorch/backends/aoti:common_shims", + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/platform:platform", - "//executorch/backends/cuda/runtime:cuda_platform", ], nvcc_flags = get_nvcc_arch_args() + [ "-_NVCC_HOST_COMPILER_FLAG_", @@ -109,33 +109,12 @@ runtime.cxx_library( ], ) +# Legacy alias for backward compatibility runtime.cxx_library( name = "runtime_shims_slim", - srcs = [ - "shims/memory_slim.cpp", - ], - headers = [ - "shims/memory_slim.h", - ], - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) - link_whole = True, - supports_python_dlopen = True, - visibility = ["@EXECUTORCH_CLIENTS"], - preprocessor_flags = ["-DCUDA_AVAILABLE=1"], - deps = [ - "//executorch/backends/aoti/slim/core:slimtensor", - "//executorch/backends/aoti/slim/factory:empty", - "//executorch/backends/aoti/slim/factory:from_blob", - "//executorch/backends/aoti:common_shims", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - ], - nvcc_flags = get_nvcc_arch_args() + [ - "-_NVCC_HOST_COMPILER_FLAG_", - "gcc", - ], - external_deps = [ - ("cuda", None, "cuda-lazy"), + visibility = ["PUBLIC"], + exported_deps = [ + ":runtime_shims", ], ) @@ -149,10 +128,16 @@ runtime.cxx_library( supports_python_dlopen = True, # Constructor needed for backend registration. compiler_flags = ["-Wno-global-constructors"], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], visibility = ["PUBLIC"], deps = [ - ":runtime_shims", + ":runtime_shims_slim", "//executorch/backends/aoti:aoti_common", + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", + "//executorch/backends/aoti/slim/factory:from_etensor", + "//executorch/extension/tensor:tensor", "//executorch/runtime/backend:interface", "//executorch/runtime/core/exec_aten/util:tensor_util", ], diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index cd1c6b96f02..4f3bdf6321a 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -19,8 +19,19 @@ #include #include #include +#include #include +// Include SlimTensor headers for CUDA backend +#include +#include +#include +#include +#include +#include +#include +#include + // Include our shim layer headers #include #include @@ -52,10 +63,113 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; +// SlimTensor type aliases +using slim::c10::Device; +using slim::c10::DeviceType; +using slim::CPU_DEVICE; +using slim::DEFAULT_CUDA_DEVICE; +using slim::DeviceTraits; +using slim::from_etensor; +using slim::SlimTensor; + namespace { constexpr char kSkipCopyOutputToCpuForMethod[] = "skip_copy_output_to_cpu_for_method"; + +/** + * Copies data from a SlimTensor to an ETensor. + * + * This function converts a SlimTensor back to an ETensor. The ETensor is + * assumed to always reside on CPU, so this handles both CPU→CPU and GPU→CPU + * copies. 
The function will resize the ETensor if needed and copy the data. + * + * @param slim_tensor Pointer to the source SlimTensor (must not be null). + * @param etensor Pointer to the destination ETensor (must not be null). + * @return Error::Ok on success, or an appropriate error code on failure. + */ +inline Error copy_slimtensor_to_etensor( + const SlimTensor* slim_tensor, + Tensor* etensor) { + ET_CHECK_OR_RETURN_ERROR( + slim_tensor != nullptr, + InvalidArgument, + "copy_slimtensor_to_etensor: slim_tensor pointer cannot be nullptr"); + + ET_CHECK_OR_RETURN_ERROR( + etensor != nullptr, + InvalidArgument, + "copy_slimtensor_to_etensor: etensor pointer cannot be nullptr"); + + // Check that storage_offset is 0 (ETensor does not support storage offsets) + ET_CHECK_OR_RETURN_ERROR( + slim_tensor->storage_offset() == 0, + InvalidArgument, + "copy_slimtensor_to_etensor: SlimTensor storage_offset must be 0, got %ld", + static_cast(slim_tensor->storage_offset())); + + // Check that SlimTensor is contiguous + ET_CHECK_OR_RETURN_ERROR( + slim_tensor->is_contiguous(), + InvalidArgument, + "copy_slimtensor_to_etensor: SlimTensor must be contiguous"); + + // Check dtype matches + slim::c10::ScalarType slim_dtype = slim_tensor->dtype(); + executorch::runtime::etensor::ScalarType etensor_dtype = etensor->scalar_type(); + ET_CHECK_OR_RETURN_ERROR( + static_cast(slim_dtype) == static_cast(etensor_dtype), + InvalidArgument, + "copy_slimtensor_to_etensor: dtype mismatch, SlimTensor dtype %d != ETensor dtype %d", + static_cast(slim_dtype), + static_cast(etensor_dtype)); + + // Check dimensions match + ET_CHECK_OR_RETURN_ERROR( + static_cast(slim_tensor->dim()) == etensor->dim(), + InvalidArgument, + "copy_slimtensor_to_etensor: dimension mismatch, SlimTensor dim %zu != ETensor dim %zd", + slim_tensor->dim(), + etensor->dim()); + + // Convert sizes from int64_t to SizesType (int32_t) for resize + const size_t ndim = slim_tensor->dim(); + std::vector new_sizes( + ndim); + auto slim_sizes = slim_tensor->sizes(); + for (size_t i = 0; i < ndim; ++i) { + new_sizes[i] = static_cast< + executorch::runtime::etensor::TensorImpl::SizesType>(slim_sizes[i]); + } + + // Resize ETensor to match SlimTensor sizes + Error resize_err = executorch::ET_RUNTIME_NAMESPACE::resize_tensor( + *etensor, + executorch::runtime::ArrayRef< + executorch::runtime::etensor::TensorImpl::SizesType>( + new_sizes.data(), new_sizes.size())); + ET_CHECK_OK_OR_RETURN_ERROR( + resize_err, "copy_slimtensor_to_etensor: failed to resize ETensor"); + + // Copy data from SlimTensor to ETensor + // SlimTensor may be on GPU or CPU, ETensor is always on CPU + size_t nbytes = slim_tensor->nbytes(); + if (nbytes > 0) { + void* dst_data = etensor->mutable_data_ptr(); + const void* src_data = slim_tensor->data_ptr(); + + if (slim_tensor->is_cpu()) { + // CPU → CPU copy + std::memcpy(dst_data, src_data, nbytes); + } else { + // GPU → CPU copy + DeviceTraits::memcpy( + dst_data, src_data, nbytes, CPU_DEVICE, slim_tensor->device()); + } + } + + return Error::Ok; } +} // anonymous namespace class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { @@ -285,87 +399,76 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) - // NOTE: ExecuTorch tensors are always on CPU/host memory - We need to create GPU copies for CUDA kernel execution - std::vector gpu_inputs( - n_inputs); // GPU copies for kernel execution - std::vector gpu_outputs( - n_outputs); // GPU tensors for kernel output + // NOTE: ExecuTorch tensors may be on CPU or
GPU due to the skip-copy optimization + // We need to create GPU copies for CUDA kernel execution using SlimTensor + std::vector gpu_input_tensors(n_inputs); + std::vector gpu_inputs(n_inputs); + std::vector gpu_output_tensors(n_outputs); + std::vector gpu_outputs(n_outputs); - // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies + // Process input tensors: convert ETensor (CPU) to SlimTensor (GPU) for (size_t i = 0; i < n_inputs; i++) { - // Get tensor dimensions and properties from ExecuTorch CPU tensor - auto cpu_tensor = &(args[i]->toTensor()); - auto sizes = cpu_tensor->sizes(); - auto scalar_type = cpu_tensor->scalar_type(); - - // Create GPU tensor with same shape - std::vector sizes_vec(sizes.begin(), sizes.end()); + auto* cpu_tensor = &(args[i]->toTensor()); + + // Check if input data is already on GPU (skip-copy optimization for inputs) + // This can happen when the caller has pre-staged data on GPU + cudaPointerAttributes attributes{}; + const void* data_ptr = cpu_tensor->const_data_ptr(); + if (data_ptr != nullptr) { + cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr); + if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { + // Data is already on GPU - wrap it directly without copy + auto sizes = cpu_tensor->sizes(); + auto strides = cpu_tensor->strides(); + std::vector sizes_vec(sizes.begin(), sizes.end()); + std::vector strides_vec(strides.begin(), strides.end()); + + gpu_input_tensors[i] = slim::from_blob( + const_cast(data_ptr), + slim::makeArrayRef(sizes_vec), + slim::makeArrayRef(strides_vec), + static_cast(cpu_tensor->scalar_type()), + DEFAULT_CUDA_DEVICE, + 0 // storage_offset + ); + gpu_inputs[i] = &gpu_input_tensors[i]; + continue; + } + } - AOTITensorHandle gpu_input_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_input_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for input %d", - i); - - gpu_inputs[i] = gpu_input_handle; - - // Copy data from CPU to GPU - ET_CHECK_OR_RETURN_ERROR( - aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, - Internal, - "Failed to copy input %d from CPU to GPU", - i); + // Data is on CPU - use from_etensor to copy to GPU + gpu_input_tensors[i] = + from_etensor(*cpu_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + gpu_inputs[i] = &gpu_input_tensors[i]; } - // Process output tensors: create GPU counterparts for ExecuTorch CPU - // tensors + + // Process output tensors: create GPU SlimTensors for kernel output for (size_t i = 0; i < n_outputs; i++) { - // Get output tensor dimensions from ExecuTorch CPU tensor - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); + auto strides = cpu_output_tensor->strides(); auto scalar_type = cpu_output_tensor->scalar_type(); - // Create GPU tensor with same shape for kernel output std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_output_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_output_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for output %d", 
- i); - - gpu_outputs[i] = gpu_output_handle; + std::vector strides_vec(strides.begin(), strides.end()); + + gpu_output_tensors[i] = slim::empty_strided( + slim::makeArrayRef(sizes_vec), + slim::makeArrayRef(strides_vec), + static_cast(scalar_type), + DEFAULT_CUDA_DEVICE); + gpu_outputs[i] = &gpu_output_tensors[i]; } - // Run AOTI container with GPU tensors + + // Run AOTI container with GPU SlimTensors AOTIRuntimeError error = handle->run( handle->container_handle, - gpu_inputs.data(), // Use GPU input tensors + reinterpret_cast(gpu_inputs.data()), n_inputs, - gpu_outputs.data(), // Use GPU output tensors + reinterpret_cast(gpu_outputs.data()), n_outputs, - handle->cuda_stream, // Pass the actual CUDA stream - nullptr); // proxy_executor_handle can remain nullptr + handle->cuda_stream, + nullptr); ET_CHECK_OR_RETURN_ERROR( error == Error::Ok, @@ -376,22 +479,53 @@ class ET_EXPERIMENTAL CudaBackend final const bool copy_outputs = !should_skip_copy_for_method(handle->method_name); if (copy_outputs) { - // Copy GPU output results back to CPU output tensors + // Copy GPU SlimTensor results back to CPU ETensors for (size_t i = 0; i < n_outputs; i++) { - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - // For DYNAMIC_BOUND tensors we try to resize + auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); ET_CHECK_OK_OR_RETURN_ERROR( - resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", - i); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", + copy_slimtensor_to_etensor(gpu_outputs[i], cpu_output_tensor), + "Failed to copy GPU output %zu back to CPU ETensor", i); } } else { - for (size_t i = 0; i < n_outputs; i++) { - args[i + n_inputs]->toTensor() = *gpu_outputs[i]; + // Skip-copy optimization: wrap GPU data as ETensor using from_blob + // The caller is responsible for handling GPU data directly + { + std::lock_guard guard(cached_outputs_mutex_); + auto& cached_outputs = cached_outputs_[handle]; + + // Clear cached outputs for previous round + cached_outputs.clear(); + for (size_t i = 0; i < n_outputs; i++) { + // Move output SlimTensors to cached_outputs for lifetime management + cached_outputs.push_back(std::move(gpu_output_tensors[i])); + + // Create an ETensor wrapper pointing to the GPU data + // The data stays on GPU and the caller handles it + SlimTensor& cached = cached_outputs.back(); + auto slim_sizes = cached.sizes(); + auto slim_strides = cached.strides(); + + std::vector et_sizes(cached.dim()); + std::vector et_strides(cached.dim()); + for (size_t d = 0; d < cached.dim(); d++) { + et_sizes[d] = + static_cast(slim_sizes[d]); + et_strides[d] = + static_cast(slim_strides[d]); + } + + // Use tensor_ptr_maker to create a non-owning ETensor wrapper + // Note: This creates a view into the SlimTensor's GPU memory + auto tensor_ptr = executorch::extension::from_blob( + cached.data_ptr(), + std::move(et_sizes), + std::move(et_strides), + static_cast(cached.dtype())); + + // Assign the wrapped tensor to the output EValue + args[i + n_inputs]->toTensor() = *tensor_ptr; + } } } @@ -424,9 +558,12 @@ class ET_EXPERIMENTAL CudaBackend final // AOTInductorModelContainerDelete(handle->container_handle); // Now close the shared library - auto err = Error::Ok; if (handle->so_handle != nullptr) { - err = close_library(handle->so_handle); + Error err = close_library(handle->so_handle); + ET_CHECK_OR_LOG_ERROR( + err == Error::Ok, + "Failed to close 
shared library for %s", + handle->so_path.c_str()); } // Remove the temporary shared library file @@ -441,12 +578,19 @@ class ET_EXPERIMENTAL CudaBackend final } delete handle; - clear_all_tensors(); } private: mutable std::mutex skip_copy_method_mutex_; std::string skip_copy_method_; + + // Cached output tensors for the skip-copy optimization. + // When skip-copy is enabled, output SlimTensors are cached here to keep + // GPU memory alive while the caller processes the results. + // Maps from AOTIDelegateHandle* to its cached outputs. + mutable std::mutex cached_outputs_mutex_; + mutable std::unordered_map> + cached_outputs_; }; } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h index 3f187000f90..2f0fb8f7546 100644 --- a/backends/cuda/runtime/guard.h +++ b/backends/cuda/runtime/guard.h @@ -19,8 +19,8 @@ namespace executorch::backends::cuda { using executorch::runtime::Error; using executorch::runtime::Result; -// Type alias for device index -using DeviceIndex = int32_t; +// Signed device index type matching DeviceIndex in the SlimTensor library +using DeviceIndex = int8_t; /** * Set the current CUDA stream for the specified device. diff --git a/backends/cuda/runtime/shims/int4mm.cuh b/backends/cuda/runtime/shims/int4mm.cuh index ee12fb51004..8ee3fcb957e 100644 --- a/backends/cuda/runtime/shims/int4mm.cuh +++ b/backends/cuda/runtime/shims/int4mm.cuh @@ -1177,13 +1177,14 @@ Tensor* _weight_int4pack_mm_cuda( ET_CHECK(B_innerKTiles == 2 || B_innerKTiles == 4 || B_innerKTiles == 8); // A is standard row major - ET_CHECK(A.dtype() == executorch::aten::ScalarType::BFloat16); + // SlimTensor::dtype() returns slim::c10::ScalarType, cast to int32_t for comparison + ET_CHECK(static_cast(A.dtype()) == static_cast(SupportedDTypes::BFLOAT16)); // ET only supports contiguous tensors for now // ET_CHECK(A.is_contiguous()); ET_CHECK(A.dim() == 2); // B has B_innerKTiles k-tiles in the innermost dimension - ET_CHECK(B.dtype() == executorch::aten::ScalarType::Int); + ET_CHECK(static_cast(B.dtype()) == static_cast(SupportedDTypes::INT32)); // ET only supports contiguous tensors for now // ET_CHECK(B.is_contiguous()); ET_CHECK(B.dim() == 4); diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 86f6cdd6396..c10cbc3ad7f 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -6,104 +6,26 @@ * LICENSE file in the root directory of this source tree.
*/ -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace executorch::backends::cuda { - -using executorch::aten::SizesType; -using executorch::aten::StridesType; -using executorch::backends::aoti::aoti_torch_dtype_bool; -using executorch::backends::aoti::aoti_torch_get_device_index; -using executorch::backends::aoti::aoti_torch_get_dtype; -using executorch::backends::aoti::aoti_torch_get_sizes; -using executorch::backends::aoti::aoti_torch_get_strides; -using executorch::backends::aoti::convert_sizes_to_vector; -using executorch::backends::aoti::convert_strides_to_vector; -using executorch::backends::aoti::dtype_to_element_size; -using executorch::backends::aoti::dtype_to_scalar_type; -using executorch::backends::aoti::validate_storage_offset; - -// Global storage for tensors and their metadata -std::unordered_set> tensors; - -// Reference counting for memory addresses -// Maps memory address to number of tensors using it -// Special value: NOT_OWN (-1) means tensor never owns the memory -constexpr int32_t NOT_OWN = -1; -std::unordered_map memory_to_n_tensor; - -namespace { - -// Calculate linear offset from strides and indices -int64_t calculate_linear_offset( - const int64_t* indices, - const int64_t* strides, - int64_t ndim) { - int64_t offset = 0; - for (int64_t i = 0; i < ndim; ++i) { - offset += indices[i] * strides[i]; - } - return offset; -} - -// Convert linear index to multi-dimensional indices based on sizes -void linear_to_indices( - int64_t linear_idx, - const int64_t* sizes, - int64_t ndim, - int64_t* indices) { - for (int64_t i = ndim - 1; i >= 0; --i) { - indices[i] = linear_idx % sizes[i]; - linear_idx /= sizes[i]; - } -} +#include +#include +#include +#include -// Generic pointwise copy function that handles arbitrary strides -template -AOTITorchError pointwise_copy_generic( - T* dst_data, - const T* src_data, - const int64_t* dst_sizes, - const int64_t* dst_strides, - const int64_t* src_sizes, - const int64_t* src_strides, - int64_t dst_ndim, - int64_t src_ndim, - int64_t total_elements) { - std::vector dst_indices(dst_ndim); - std::vector src_indices(src_ndim); - - for (int64_t linear_idx = 0; linear_idx < total_elements; ++linear_idx) { - // Convert linear index to multi-dimensional indices for both tensors - linear_to_indices(linear_idx, dst_sizes, dst_ndim, dst_indices.data()); - linear_to_indices(linear_idx, src_sizes, src_ndim, src_indices.data()); - - // Calculate offsets for both source and destination - int64_t src_offset = - calculate_linear_offset(src_indices.data(), src_strides, src_ndim); - int64_t dst_offset = - calculate_linear_offset(dst_indices.data(), dst_strides, dst_ndim); - - // Copy element - dst_data[dst_offset] = src_data[src_offset]; - } +namespace executorch::backends::cuda { - return Error::Ok; -} +namespace c10 = executorch::backends::aoti::slim::c10; +using c10::Device; +using c10::DeviceIndex; +using c10::DeviceType; +using c10::ScalarType; +using executorch::backends::aoti::slim::empty_strided; +using executorch::backends::aoti::slim::from_blob; +using executorch::backends::aoti::slim::IntArrayRef; -} // anonymous namespace +// Use SlimTensor directly to avoid naming conflicts with ETensor +using SlimTensor = executorch::backends::aoti::slim::SlimTensor; extern "C" { @@ -116,109 +38,43 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor, + SlimTensor** ret_new_tensor, 
int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size) { - (void)opaque_metadata; + // Unused parameters (void)layout; + (void)opaque_metadata; (void)opaque_metadata_size; - // Validate input parameters first ET_CHECK_OR_RETURN_ERROR( data != nullptr, InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null"); + "aoti_torch_create_tensor_from_blob_v2: data is null"); ET_CHECK_OR_RETURN_ERROR( ret_new_tensor != nullptr, InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null"); - - // Check that device_index is always 0 - ET_CHECK_OR_RETURN_ERROR( - device_index == 0, - InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // Validate dtype using SupportedDTypes from utils.h - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Storage offset must be 0 since from_blob cannot handle different offsets - ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); - - // Verify that data pointer location matches the requested device_type - cudaPointerAttributes data_attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&data_attributes, data)); - - bool data_is_on_device = data_attributes.type == cudaMemoryTypeDevice; - bool data_is_on_host = data_attributes.type == cudaMemoryTypeHost || - data_attributes.type == cudaMemoryTypeUnregistered; - bool requested_device = - device_type == static_cast(SupportedDevices::CUDA); - bool requested_cpu = - device_type == static_cast(SupportedDevices::CPU); - - // Error if data location doesn't match requested device type - ET_CHECK_OR_RETURN_ERROR( - !(data_is_on_device && requested_cpu), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CUDA " - "but device_type is CPU. Data must be on CPU for CPU tensors.", - data); + "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null"); ET_CHECK_OR_RETURN_ERROR( - !(data_is_on_host && requested_device), + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CPU " - "but device_type is CUDA. Data must be on GPU for CUDA tensors.", - data); - - // Convert sizes to the format expected by ExecutorTorch using SizesType - std::vector sizes = - convert_sizes_to_vector(ndim, sizes_ptr); - - // Convert strides using the common helper function with StridesType - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create ExecutorTorch tensor that wraps the existing memory - // Note: We're NOT copying the data, just wrapping it - // Using CUDA-specific tensor maker that supports incontiguous tensors - auto tensor = make_tensor( - sizes, // tensor dimensions - data, // existing memory (don't copy!) 
- {}, // dim_order (empty, will be auto-generated) - strides, // tensor strides (allows different strides) - dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType - ); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Failed to create tensor from blob"); + "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0"); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - - *ret_new_tensor = tensor.get(); - - // Check if this memory address is already being tracked - auto memory_it = memory_to_n_tensor.find(data); - ET_CHECK_OR_RETURN_ERROR( - memory_it == memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is already being tracked by another tensor", - data); + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); - // Mark this memory as NOT_OWN since tensor created from blob never owns - // memory - memory_to_n_tensor[data] = NOT_OWN; + // Create the SlimTensor using from_blob (non-owning) + *ret_new_tensor = new SlimTensor(from_blob( + data, + sizes, + strides, + static_cast(dtype), + Device( + static_cast(device_type), + static_cast(device_index)), + storage_offset)); return Error::Ok; } @@ -230,697 +86,177 @@ AOTITorchError aoti_torch_empty_strided( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor) { - // Check that device_index is always 0 + SlimTensor** ret_new_tensor) { ET_CHECK_OR_RETURN_ERROR( - device_index == 0, + ret_new_tensor != nullptr, InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // This requires us to reserve CUDA memory and put it into a ETensor - void* ptr; + "aoti_torch_empty_strided: ret_new_tensor is null"); - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - size_t element_size = dtype_to_element_size(dtype); ET_CHECK_OR_RETURN_ERROR( - element_size != 0, + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "Invalid element size for dtype: %d", - dtype); - - // Calculate storage size based on strides, matching PyTorch's behavior - // This is critical when sizes and strides don't match the expected contiguous - // layout Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp - int64_t storage_size = 1; // storage offset (0) + 1 - for (int64_t i = 0; i < ndim; i++) { - if (sizes_ptr[i] == 0) { - storage_size = 0; - break; - } - // For each dimension, add stride[i] * (size[i] - 1) - // This gives us the maximum offset in that dimension - int64_t stride_i = (strides_ptr != nullptr) ? 
strides_ptr[i] : 1; - if (strides_ptr == nullptr) { - // Calculate contiguous stride if not provided - for (int64_t j = i + 1; j < ndim; j++) { - stride_i *= sizes_ptr[j]; - } - } - storage_size += stride_i * (sizes_ptr[i] - 1); - } - int64_t nbytes = storage_size * element_size; - - if (device_type == static_cast(SupportedDevices::CUDA)) { - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaMallocAsync(&ptr, static_cast(nbytes), cudaStreamDefault)); - } else if (device_type == static_cast(SupportedDevices::CPU)) { - // Ensure 16-byte alignment for CPU memory to match CUDA requirements - ptr = aligned_alloc(16, nbytes); - ET_CHECK_OR_RETURN_ERROR( - ptr != nullptr, - MemoryAllocationFailed, - "Failed to allocate aligned CPU memory"); - } else { - ET_CHECK_OR_RETURN_ERROR( - false, - NotImplemented, - "Need to implement empty_strided for non-CUDA non-CPU device type %d", - device_type); - } - - // ETensor sizes - auto sizes = convert_sizes_to_vector(ndim, sizes_ptr); - - // ETensor strides - auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // ETensor creation with dynamic shape support for edge cases - // Using CUDA-specific tensor maker that supports incontiguous tensors - auto tensor = make_tensor( + "aoti_torch_empty_strided: sizes_ptr is null but ndim > 0"); + + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); + + // Create the SlimTensor using empty_strided (owning) + *ret_new_tensor = new SlimTensor(empty_strided( sizes, - ptr, - {}, // dim_order (empty, will be auto-generated) strides, - dtype_to_scalar_type(dtype)); + static_cast(dtype), + Device( + static_cast(device_type), + static_cast(device_index)))); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - *ret_new_tensor = tensor.get(); - - // This tensor owns the memory it allocated, set reference count to 1 - memory_to_n_tensor[ptr] = 1; return Error::Ok; } -void clear_all_tensors() { - // Use aoti_torch_delete_tensor_object to properly delete each tensor - // Note: We need to collect tensor pointers first since deletion modifies the - // set - std::vector tensor_ptrs; - tensor_ptrs.reserve(tensors.size()); - for (const auto& tensor_shared : tensors) { - tensor_ptrs.push_back(tensor_shared.get()); - } - - // Now delete each tensor - this will modify the global tensors set - for (Tensor* tensor_ptr : tensor_ptrs) { - aoti_torch_delete_tensor_object(tensor_ptr); - } - - // tensors set should now be empty, but ensure it's cleared - tensors.clear(); - - // Clear memory tracking map (includes leftover NOT_OWN entries) - memory_to_n_tensor.clear(); - - ET_LOG(Info, "Cleared all tensors and memory tracking"); -} - -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { - // Handle null tensor pointer - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Cannot delete null tensor"); - - // Check if tensor exists in our tracking - bool found_in_tensors = false; - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - found_in_tensors = true; - break; - } - } - - // If tensor not found in our tracking, it's invalid +AOTITorchError aoti_torch_delete_tensor_object(SlimTensor* tensor) { ET_CHECK_OR_RETURN_ERROR( - found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor); - - // Find and delete the tensor - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - // Get the tensor before erasing - auto tensor_ptr = *it; - void* data_ptr = 
tensor_ptr->mutable_data_ptr(); - - // Find the reference count for this memory address - auto memory_it = memory_to_n_tensor.find(data_ptr); - if (memory_it != memory_to_n_tensor.end()) { - int32_t ref_count = memory_it->second; - - if (ref_count == NOT_OWN) { - // Tensor never owned the memory, skip freeing - // Just remove tensor from tracking - tensors.erase(it); - return Error::Ok; - } else if (ref_count == 1) { - // Only current tensor using this memory, free it - // Determine if it's GPU memory - cudaPointerAttributes attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&attributes, data_ptr)); - - if (attributes.type == cudaMemoryTypeDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaFreeAsync(data_ptr, cudaStreamDefault)); - } else { - ET_CHECK_OR_RETURN_ERROR( - attributes.type != cudaMemoryTypeManaged, - Internal, - "Expected host memory but got managed!") - // This is CPU memory - free immediately - aligned_free(data_ptr); - data_ptr = nullptr; - } - - // Remove from memory tracking - memory_to_n_tensor.erase(memory_it); - } else if (ref_count > 1) { - // Other tensors still using this memory, just decrement count - memory_to_n_tensor[data_ptr] = ref_count - 1; - } - } else { - ET_CHECK_OR_RETURN_ERROR( - false, - Internal, - "Internal error: memory not found during deletion"); - } - - // Remove tensor from set (this will call the destructor if it's the last - // reference) - tensors.erase(it); - return Error::Ok; - } - } - - // This should never be reached since we found it above - ET_CHECK_OR_RETURN_ERROR( - false, Internal, "Internal error: tensor not found after validation"); -} - -AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { - (void)non_blocking; - - // Check for null pointers first - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, - InvalidArgument, - "aoti_torch_copy_ failed: self tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, + tensor != nullptr, InvalidArgument, - "aoti_torch_copy_ failed: src tensor is null"); + "aoti_torch_delete_tensor_object: tensor is null"); - // Get dtype information and validate compatibility - int32_t self_dtype, src_dtype; - aoti_torch_get_dtype(self, &self_dtype); - aoti_torch_get_dtype(src, &src_dtype); + // SlimTensor uses SharedPtr for storage, so simply deleting the tensor + // will automatically handle reference counting and free the underlying + // storage when no more references exist. + delete tensor; - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype)); - - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype)); + return Error::Ok; +} - // Check dtype compatibility - both tensors must have the same dtype +AOTITorchError aoti_torch_new_tensor_handle( + SlimTensor* orig_handle, + SlimTensor** new_handle) { ET_CHECK_OR_RETURN_ERROR( - self_dtype == src_dtype, + orig_handle != nullptr, InvalidArgument, - "dtype mismatch. self.dtype=%d, src.dtype=%d. aoti_torch_copy_ requires same dtypes", - self_dtype, - src_dtype); - - // Check total number of elements compatibility (PyTorch copy_ behavior) - int64_t self_numel = self->numel(); - int64_t src_numel = src->numel(); + "aoti_torch_new_tensor_handle: orig_handle is null"); ET_CHECK_OR_RETURN_ERROR( - self_numel == src_numel, + new_handle != nullptr, InvalidArgument, - "numel mismatch. 
self.numel()=%ld, src.numel()=%ld", - self_numel, - src_numel); - - // Get tensor metadata - int64_t* self_strides; - int64_t* src_strides; - aoti_torch_get_strides(self, &self_strides); - aoti_torch_get_strides(src, &src_strides); - - int64_t* self_sizes; - int64_t* src_sizes; - aoti_torch_get_sizes(self, &self_sizes); - aoti_torch_get_sizes(src, &src_sizes); - - // Determine device locations - cudaPointerAttributes srcAttributes{}; - cudaPointerAttributes dstAttributes{}; - - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&srcAttributes, src->data_ptr())); - - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&dstAttributes, self->data_ptr())); - - bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; - bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - - // Check if tensors have the same schema (sizes, strides, dtype) for fast path - bool same_schema = true; - for (int i = 0; i < self->dim(); i++) { - if (self_strides[i] != src_strides[i]) { - same_schema = false; - break; - } - } - - size_t total_bytes = src->nbytes(); - int64_t total_elements = self->numel(); - - if (same_schema) { - // Fast path: Direct memory copy since layouts match exactly - if (srcIsDevice && dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyDeviceToDevice)); - } else if (srcIsDevice && !dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyDeviceToHost)); - } else if (!srcIsDevice && dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyHostToDevice)); - } else { - std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); - } - } else { - // Fallback path: Pointwise copy with stride-aware indexing - // This handles arbitrary tensor layouts and strides - - size_t element_size = dtype_to_element_size(self_dtype); - ET_CHECK_OR_RETURN_ERROR( - element_size != 0, - InvalidArgument, - "Invalid element size for dtype: %d", - self_dtype); - - // Allocate temporary host memory for GPU tensors - float* src_host_data = nullptr; - float* dst_host_data = nullptr; - bool need_free_src = false; - bool need_free_dst = false; - - if (srcIsDevice) { - src_host_data = - static_cast(malloc(total_elements * sizeof(float))); - ET_CHECK_OR_RETURN_ERROR( - src_host_data != nullptr, - MemoryAllocationFailed, - "Failed to allocate memory for src_host_data"); - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost)); - need_free_src = true; - } else { - src_host_data = static_cast(src->data_ptr()); - } - - if (dstIsDevice) { - dst_host_data = - static_cast(malloc(total_elements * sizeof(float))); - if (dst_host_data == nullptr) { - if (need_free_src) { - free(src_host_data); - } - ET_CHECK_OR_RETURN_ERROR( - false, - MemoryAllocationFailed, - "Failed to allocate memory for dst_host_data"); - } - need_free_dst = true; - } else { - dst_host_data = static_cast(self->mutable_data_ptr()); - } - - // Perform pointwise copy with stride calculation - AOTITorchError copy_err = pointwise_copy_generic( - dst_host_data, - src_host_data, - self_sizes, - self_strides, - src_sizes, - src_strides, - self->dim(), - src->dim(), - total_elements); - - if (copy_err != Error::Ok) { - // Clean up temporary buffers before returning - if (need_free_src) { - free(src_host_data); - } - if (need_free_dst) { - 
free(dst_host_data); - } - return copy_err; - } - - // Copy result back to device if needed - if (dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - dst_host_data, - total_bytes, - cudaMemcpyHostToDevice)); - } - - // Clean up temporary buffers - if (need_free_src) { - free(src_host_data); - } - if (need_free_dst) { - free(dst_host_data); - } - } + "aoti_torch_new_tensor_handle: new_handle is null"); + + // Create a new SlimTensor that shares the same underlying storage. + // SlimTensor's copy constructor shares the SharedPtr, so both + // tensors will reference the same memory. When the last tensor is deleted, + // the storage will be freed. + *new_handle = new SlimTensor(*orig_handle); return Error::Ok; } AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, + SlimTensor* self, int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, int64_t storage_offset, - Tensor** ret_new_tensor) { - // Validate input parameters first + SlimTensor** ret_new_tensor) { ET_CHECK_OR_RETURN_ERROR( self != nullptr, InvalidArgument, - "aoti_torch__reinterpret_tensor failed: self tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch__reinterpret_tensor failed: sizes_ptr is null"); + "aoti_torch__reinterpret_tensor: self is null"); ET_CHECK_OR_RETURN_ERROR( ret_new_tensor != nullptr, InvalidArgument, - "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null"); - - // Check if storage_offset is not 0 - return error if not - ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); - - // Get the device info from the source tensor to perform device_index - // validation - int32_t device_type = 0; - int32_t device_index = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type)); - - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index)); + "aoti_torch__reinterpret_tensor: ret_new_tensor is null"); - // Ensure device_index is always 0 ET_CHECK_OR_RETURN_ERROR( - device_index == 0, + ndim >= 0, InvalidArgument, - "device_index must be 0, got: %d", - device_index); + "aoti_torch__reinterpret_tensor: ndim must be non-negative, got %lld", + static_cast(ndim)); - // Get the dtype from the source tensor - int32_t dtype = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype)); - - // Validate dtype using SupportedDTypes - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Get the original data pointer from the source tensor - void* data_ptr = self->mutable_data_ptr(); ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); - - // Check if the given memory is in the map, if not return error - auto memory_it = memory_to_n_tensor.find(data_ptr); - ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Convert sizes using utility function from utils.h - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - - // Convert strides using utility function from utils.h - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor view that reinterprets the same memory with different - // shape/strides This creates a view, not a copy - the data pointer is shared - // Using CUDA-specific tensor maker that supports incontiguous tensors - std::shared_ptr tensor = make_tensor( - sizes, // New sizes 
with explicit SizesType - data_ptr, // Reuse the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, // New strides with explicit StridesType - dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting - ); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "Failed to create reinterpreted tensor view"); + "aoti_torch__reinterpret_tensor: sizes_ptr is null but ndim > 0"); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); - *ret_new_tensor = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // Create a new tensor view using as_strided. This creates a tensor that + // shares the same underlying storage but with different sizes, strides, + // and storage offset. SlimTensor::as_strided() handles this via copy + // constructor which shares the SharedPtr. + *ret_new_tensor = + new SlimTensor(self->as_strided(sizes, strides, storage_offset)); return Error::Ok; } -AOTITorchError aoti_torch_new_tensor_handle( - Tensor* orig_handle, - Tensor** new_handle) { - // Validate input parameters - ET_CHECK_OR_RETURN_ERROR( - orig_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle failed: orig_handle is null"); - - ET_CHECK_OR_RETURN_ERROR( - new_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle failed: new_handle is null"); - - // Get metadata from the original tensor - int64_t* sizes_ptr; - int64_t* strides_ptr; - int32_t dtype; - int32_t device_type; - int32_t device_index; - - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(orig_handle, &sizes_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_strides(orig_handle, &strides_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(orig_handle, &dtype)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_device_type(orig_handle, &device_type)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_device_index(orig_handle, &device_index)); - - int64_t ndim = orig_handle->dim(); - - // Validate dtype - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Ensure device_index is always 0 - ET_CHECK_OR_RETURN_ERROR( - device_index == 0, - InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // Get the original data pointer from the source tensor - void* data_ptr = orig_handle->mutable_data_ptr(); - ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); +AOTITorchError +aoti_torch_copy_(SlimTensor* self, SlimTensor* src, int32_t non_blocking) { + (void)non_blocking; // SlimTensor::copy_() is always synchronous for now - // Check if the given memory is in the map - auto memory_it = memory_to_n_tensor.find(data_ptr); ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Convert sizes and strides to vectors - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor that shares the same memory as the original - // This is similar to PyTorch's Tensor copy constructor - creates a new - // tensor 
object that shares the same underlying storage - std::shared_ptr tensor = make_tensor( - sizes, // Same sizes as original - data_ptr, // Share the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, // Same strides as original - dtype_to_scalar_type(dtype) // Same dtype as original - ); + self != nullptr, InvalidArgument, "aoti_torch_copy_: self is null"); ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Failed to create new tensor handle"); - - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); + src != nullptr, InvalidArgument, "aoti_torch_copy_: src is null"); - *new_handle = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // SlimTensor::copy_() handles: + // - Same numel validation + // - Same dtype validation + // - CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA copies + // - Contiguous fast path and non-contiguous element-wise copy + self->copy_(*src); return Error::Ok; } -AOTITorchError aoti_torch_item_bool(Tensor* tensor, bool* ret_value) { - // Validate input parameters +AOTITorchError aoti_torch_item_bool(SlimTensor* tensor, bool* ret_value) { ET_CHECK_OR_RETURN_ERROR( tensor != nullptr, InvalidArgument, - "aoti_torch_item_bool failed: tensor is null"); + "aoti_torch_item_bool: tensor is null"); ET_CHECK_OR_RETURN_ERROR( ret_value != nullptr, InvalidArgument, - "aoti_torch_item_bool failed: ret_value is null"); - - // Validate that tensor dtype is bool - int32_t dtype; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(tensor, &dtype)); + "aoti_torch_item_bool: ret_value is null"); ET_CHECK_OR_RETURN_ERROR( - dtype == aoti_torch_dtype_bool(), + tensor->numel() == 1, InvalidArgument, - "aoti_torch_item_bool failed: tensor dtype is not bool (got %d)", - dtype); + "aoti_torch_item_bool: tensor must have exactly 1 element, got %zu", + tensor->numel()); - // Get the data pointer - const void* data_ptr = tensor->const_data_ptr(); ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, + tensor->dtype() == ScalarType::Bool, InvalidArgument, - "aoti_torch_item_bool failed: tensor data pointer is null"); - - // Check if tensor is on CUDA or CPU - cudaPointerAttributes attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&attributes, data_ptr)); - - if (attributes.type == cudaMemoryTypeDevice) { - // CUDA memory case: copy from device to host - bool device_value; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - &device_value, data_ptr, sizeof(bool), cudaMemcpyDeviceToHost)); - *ret_value = device_value; - } else { - // CPU memory case: direct access - const bool* bool_ptr = static_cast(data_ptr); - *ret_value = *bool_ptr; - } + "aoti_torch_item_bool: tensor dtype must be Bool"); + + // SlimTensor::item() handles both CPU and CUDA tensors. + // For CUDA tensors, it copies the value to CPU automatically. 
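+  // (As documented in memory.h above, this read is synchronizing for CUDA
+  //  tensors — a device-to-host copy — so callers should not expect this
+  //  path to be asynchronous.)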
+ *ret_value = tensor->item(); return Error::Ok; } -AOTITorchError aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) { - // Validate input parameters +AOTITorchError aoti_torch_assign_tensors_out(SlimTensor* src, SlimTensor** ret_dst) { ET_CHECK_OR_RETURN_ERROR( src != nullptr, InvalidArgument, - "aoti_torch_assign_tensors_out failed: src is null"); + "aoti_torch_assign_tensors_out: src is null"); ET_CHECK_OR_RETURN_ERROR( ret_dst != nullptr, InvalidArgument, - "aoti_torch_assign_tensors_out failed: ret_dst is null"); - - // Get the data pointer from the source tensor - void* data_ptr = src->mutable_data_ptr(); - ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); + "aoti_torch_assign_tensors_out: ret_dst is null"); - // Check if the given memory is in the map, if not return error - auto memory_it = memory_to_n_tensor.find(data_ptr); - ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Get dtype from source tensor - int32_t dtype = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(src, &dtype)); - - // Get sizes and strides from source tensor - int64_t* sizes_ptr; - int64_t* strides_ptr; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(src, &sizes_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_strides(src, &strides_ptr)); - - int64_t ndim = src->dim(); - - // Convert to vectors - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor view that shares the same memory as source tensor - std::shared_ptr tensor = make_tensor( - sizes, - data_ptr, // Share the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, - dtype_to_scalar_type(dtype)); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "Failed to create tensor view in aoti_torch_assign_tensors_out"); - - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - - *ret_dst = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // Move the source tensor into the destination. After this operation, + // the source tensor will be left in an undefined state (reset). + // This differs from aoti_torch_new_tensor_handle which copies the tensor. 
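+  // (Note: assuming standard C++ move semantics, only the contents of `src`
+  //  are transferred; the heap-allocated `src` object itself is not freed
+  //  here and its owner is still expected to release it via
+  //  aoti_torch_delete_tensor_object.)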
+ *ret_dst = new SlimTensor(std::move(*src)); return Error::Ok; } + } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 34b781a5270..036fa5ec6c6 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -8,15 +8,20 @@ #pragma once -#include -#include -#include #include +#include +#include +#include +#include + namespace executorch::backends::cuda { -using executorch::backends::aoti::AOTITorchError; -using executorch::backends::aoti::Tensor; +using executorch::runtime::Error; +using AOTITorchError = Error; + +// Use SlimTensor directly in shim APIs to avoid naming conflicts with ETensor +using SlimTensor = executorch::backends::aoti::slim::SlimTensor; extern "C" { @@ -28,21 +33,17 @@ extern "C" { * * @param data Pointer to the memory blob to wrap (must not be null) * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes (using SizesType) - * @param strides_ptr Pointer to array of strides for each dimension (using - * StridesType, can be null for contiguous) - * @param storage_offset Storage offset (must be 0 for current implementation) - * @param dtype Data type identifier (supports FLOAT32 and BFLOAT16 from - * SupportedDTypes) - * @param device_type Device type (CPU=0, CUDA=1 from SupportedDevices) - * @param device_index Device index (must be 0 for current implementation) - * @param ret_new_tensor Output parameter for the created tensor (must not be - * null) + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param storage_offset Storage offset in number of elements + * @param dtype Data type identifier (matches PyTorch scalar types) + * @param device_type Device type (CPU=0, CUDA=1) + * @param device_index Device index + * @param ret_new_tensor Output parameter for the created tensor * @param layout Tensor layout identifier (0=strided) * @param opaque_metadata Optional metadata pointer (can be null) * @param opaque_metadata_size Size of opaque metadata in bytes - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, @@ -53,24 +54,23 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor, + SlimTensor** ret_new_tensor, int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size); /** * Creates an uninitialized tensor with specified dimensions, strides, and - * dtyper on either CPU or CUDA device. + * dtype on either CPU or CUDA device. 
* * @param ndim Number of dimensions in the tensor * @param sizes_ptr Pointer to array of dimension sizes * @param strides_ptr Pointer to array of strides for each dimension * @param dtype Data type identifier (matches PyTorch scalar types) * @param device_type Device type (0=CPU, 1=CUDA) - * @param device_index Device index (must be 0 for current implementation) + * @param device_index Device index * @param ret_new_tensor Output parameter for the created tensor - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, @@ -79,129 +79,99 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor); + SlimTensor** ret_new_tensor); /** - * Deletes a tensor object and frees its associated memory. + * Deletes a tensor object and frees associated resources. * - * @param tensor Pointer to the tensor object to be deleted - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * For SlimTensor, the underlying storage uses SharedPtr-based reference + * counting. When the last tensor referencing the storage is deleted, + * the memory is automatically freed. + * + * @param tensor Pointer to the tensor to delete (must not be null) + * @return AOTITorchError error code (Error::Ok on success) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(SlimTensor* tensor); /** - * Creates a tensor view that reinterprets the same underlying memory with - * different shape and strides without copying data. + * Creates a new tensor handle that shares storage with the original tensor. * - * Note that the new tensor will not have the ownership of the underlying - * memory. + * The new handle is a copy of the original tensor's metadata (sizes, strides, + * dtype, device) and shares the same underlying storage via SharedPtr. + * Both tensors will reference the same memory, and the memory will only be + * freed when all references are deleted. * - * @param self Input tensor whose memory will be reinterpreted - * @param ndim Number of dimensions for the new tensor view - * @param sizes_ptr Array of sizes for each dimension - * @param strides_ptr Array of strides for each dimension (or nullptr for - * contiguous) - * @param storage_offset Storage offset (must be 0) - * @param ret_new_tensor Output pointer to store the new tensor view + * @param orig_handle Pointer to the original tensor (must not be null) + * @param new_handle Output parameter for the new tensor handle + * @return AOTITorchError error code (Error::Ok on success) + */ +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(SlimTensor* orig_handle, SlimTensor** new_handle); + +/** + * Creates a reinterpreted view of a tensor with new sizes, strides, and offset. + * + * This is equivalent to torch.as_strided() - it creates a new tensor that + * shares the same underlying storage but with different view parameters. 
* - * @return Error::Ok on success, appropriate error code on failure + * @param self Original tensor to reinterpret (must not be null) + * @param ndim Number of dimensions for the new view + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param storage_offset Storage offset in number of elements + * @param ret_new_tensor Output parameter for the reinterpreted tensor view + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, + SlimTensor* self, int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, int64_t storage_offset, - Tensor** ret_new_tensor); + SlimTensor** ret_new_tensor); /** * Copies data from source tensor to destination tensor. * - * This function implements copy function for tensors living in CUDA AOTI - * backend. It supports copying between tensors with different shapes (as long - * as they have the same total number of elements) and different memory - * layouts/strides. - * - * Note that currently this function does not support copying between tensors - * with different dtypes. - * - * @param self Destination tensor (data will be overwritten) - * @param src Source tensor (data will be copied from this tensor) - * @param non_blocking Whether the copy should be non-blocking (currently - * ignored) - * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers, dtype mismatch, numel - * mismatch - * - Error::MemoryAllocationFailed: failed to allocate temporary memory - * - Error::Internal: CUDA operation failures - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); - -/** - * Creates a new tensor handle from an existing one. - * - * This function creates a new tensor object that shares the same underlying - * memory as the original tensor. Similar to PyTorch's Tensor copy constructor, - * it creates a new handle/reference to the same data without performing a deep - * copy. - * - * The new tensor will: - * - Share the same memory/storage as the original tensor - * - Have the same shape, strides, and dtype as the original - * - Increment the reference count for the underlying memory (if owned) - * - * @param orig_handle Original tensor to create a new handle from (must not be - * null) - * @param new_handle Output pointer to store the new tensor handle (must not be - * null) + * Handles all device combinations (CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA) + * and supports tensors with different strides. The destination tensor must + * already be allocated with sufficient storage. * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or invalid parameters + * @param self Destination tensor (must not be null) + * @param src Source tensor to copy from (must not be null) + * @param non_blocking If true, the copy may be asynchronous (currently ignored) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); +aoti_torch_copy_(SlimTensor* self, SlimTensor* src, int32_t non_blocking); /** - * Retrieves a boolean value from a 0D boolean tensor. + * Extracts a boolean scalar value from a single-element tensor. * - * This function extracts the scalar boolean value from a tensor that contains - * a single boolean element. 
The tensor can be on either CPU or CUDA device. - * For CUDA tensors, the value is copied from device to host memory. + * The tensor must contain exactly one element and have Bool dtype. + * For CUDA tensors, this will synchronize to copy the value to CPU. * - * @param tensor Pointer to a 0D boolean tensor (must not be null) - * @param ret_value Output pointer to store the boolean value (must not be null) - * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or tensor dtype is not bool + * @param tensor Single-element boolean tensor (must not be null) + * @param ret_value Output parameter for the extracted boolean value + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_item_bool(Tensor* tensor, bool* ret_value); +aoti_torch_item_bool(SlimTensor* tensor, bool* ret_value); /** - * Creates a new tensor that shares the same underlying data as the source - * tensor. - * - * This function creates a new tensor view with the same shape, strides, and - * dtype as the source tensor, sharing the same underlying memory. The new - * tensor handle will be stored in ret_dst. + * Moves a tensor into a new handle and assigns it to the output parameter. * - * @param src The source tensor providing the data and metadata. - * @param ret_dst On output, this will point to the new tensor view. + * Unlike aoti_torch_new_tensor_handle which copies, this function moves the + * source tensor into the destination. After this operation, the source tensor + * is left in an undefined/reset state and should not be used. * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or memory not tracked + * @param src Source tensor to move from (must not be null, will be reset) + * @param ret_dst Output parameter for the new tensor handle + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst); - -// Function to clear all tensors from internal storage -AOTI_SHIM_EXPORT void clear_all_tensors(); +aoti_torch_assign_tensors_out(SlimTensor* src, SlimTensor** ret_dst); -// Function to clear memory tracking map (for test cleanup) -AOTI_SHIM_EXPORT void clear_memory_tracking(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory_slim.cpp b/backends/cuda/runtime/shims/memory_slim.cpp deleted file mode 100644 index 58bf43b34b0..00000000000 --- a/backends/cuda/runtime/shims/memory_slim.cpp +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace executorch::backends::cuda { - -namespace c10 = executorch::backends::aoti::slim::c10; -using c10::Device; -using c10::DeviceIndex; -using c10::DeviceType; -using c10::ScalarType; -using executorch::backends::aoti::slim::empty_strided; -using executorch::backends::aoti::slim::from_blob; -using executorch::backends::aoti::slim::IntArrayRef; - -extern "C" { - -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { - // Unused parameters - (void)layout; - (void)opaque_metadata; - (void)opaque_metadata_size; - - ET_CHECK_OR_RETURN_ERROR( - data != nullptr, - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: data is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create the SlimTensor using from_blob (non-owning) - *ret_new_tensor = new Tensor(from_blob( - data, - sizes, - strides, - static_cast(dtype), - Device( - static_cast(device_type), - static_cast(device_index)), - storage_offset)); - - return Error::Ok; -} - -AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor) { - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch_empty_strided: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_empty_strided: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create the SlimTensor using empty_strided (owning) - *ret_new_tensor = new Tensor(empty_strided( - sizes, - strides, - static_cast(dtype), - Device( - static_cast(device_type), - static_cast(device_index)))); - - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "aoti_torch_delete_tensor_object: tensor is null"); - - // SlimTensor uses SharedPtr for storage, so simply deleting the tensor - // will automatically handle reference counting and free the underlying - // storage when no more references exist. - delete tensor; - - return Error::Ok; -} - -AOTITorchError aoti_torch_new_tensor_handle( - Tensor* orig_handle, - Tensor** new_handle) { - ET_CHECK_OR_RETURN_ERROR( - orig_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle: orig_handle is null"); - - ET_CHECK_OR_RETURN_ERROR( - new_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle: new_handle is null"); - - // Create a new SlimTensor that shares the same underlying storage. - // SlimTensor's copy constructor shares the SharedPtr, so both - // tensors will reference the same memory. When the last tensor is deleted, - // the storage will be freed. 
- *new_handle = new Tensor(*orig_handle); - - return Error::Ok; -} - -AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - Tensor** ret_new_tensor) { - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, - InvalidArgument, - "aoti_torch__reinterpret_tensor: self is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch__reinterpret_tensor: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - ndim >= 0, - InvalidArgument, - "aoti_torch__reinterpret_tensor: ndim must be non-negative, got %lld", - static_cast(ndim)); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch__reinterpret_tensor: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create a new tensor view using as_strided. This creates a tensor that - // shares the same underlying storage but with different sizes, strides, - // and storage offset. SlimTensor::as_strided() handles this via copy - // constructor which shares the SharedPtr. - *ret_new_tensor = - new Tensor(self->as_strided(sizes, strides, storage_offset)); - - return Error::Ok; -} - -AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { - (void)non_blocking; // SlimTensor::copy_() is always synchronous for now - - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, InvalidArgument, "aoti_torch_copy_: self is null"); - - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, InvalidArgument, "aoti_torch_copy_: src is null"); - - // SlimTensor::copy_() handles: - // - Same numel validation - // - Same dtype validation - // - CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA copies - // - Contiguous fast path and non-contiguous element-wise copy - self->copy_(*src); - - return Error::Ok; -} - -AOTITorchError aoti_torch_item_bool(Tensor* tensor, bool* ret_value) { - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "aoti_torch_item_bool: tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_value != nullptr, - InvalidArgument, - "aoti_torch_item_bool: ret_value is null"); - - ET_CHECK_OR_RETURN_ERROR( - tensor->numel() == 1, - InvalidArgument, - "aoti_torch_item_bool: tensor must have exactly 1 element, got %zu", - tensor->numel()); - - ET_CHECK_OR_RETURN_ERROR( - tensor->dtype() == ScalarType::Bool, - InvalidArgument, - "aoti_torch_item_bool: tensor dtype must be Bool"); - - // SlimTensor::item() handles both CPU and CUDA tensors. - // For CUDA tensors, it copies the value to CPU automatically. - *ret_value = tensor->item(); - - return Error::Ok; -} - -AOTITorchError aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) { - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, - InvalidArgument, - "aoti_torch_assign_tensors_out: src is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_dst != nullptr, - InvalidArgument, - "aoti_torch_assign_tensors_out: ret_dst is null"); - - // Move the source tensor into the destination. After this operation, - // the source tensor will be left in an undefined state (reset). - // This differs from aoti_torch_new_tensor_handle which copies the tensor. 
- *ret_dst = new Tensor(std::move(*src)); - - return Error::Ok; -} - -} // extern "C" - -} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory_slim.h b/backends/cuda/runtime/shims/memory_slim.h deleted file mode 100644 index 5a0845f243c..00000000000 --- a/backends/cuda/runtime/shims/memory_slim.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -namespace executorch::backends::cuda { - -using executorch::runtime::Error; -using AOTITorchError = Error; -using Tensor = executorch::backends::aoti::slim::SlimTensor; - -extern "C" { - -/** - * Creates a tensor object from an existing memory blob without copying the - * data. The tensor will wrap the provided memory and will not take ownership of - * it. When the tensor is deleted, the original memory will remain valid and - * must be freed by the caller. - * - * @param data Pointer to the memory blob to wrap (must not be null) - * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param storage_offset Storage offset in number of elements - * @param dtype Data type identifier (matches PyTorch scalar types) - * @param device_type Device type (CPU=0, CUDA=1) - * @param device_index Device index - * @param ret_new_tensor Output parameter for the created tensor - * @param layout Tensor layout identifier (0=strided) - * @param opaque_metadata Optional metadata pointer (can be null) - * @param opaque_metadata_size Size of opaque metadata in bytes - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -/** - * Creates an uninitialized tensor with specified dimensions, strides, and - * dtype on either CPU or CUDA device. - * - * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param dtype Data type identifier (matches PyTorch scalar types) - * @param device_type Device type (0=CPU, 1=CUDA) - * @param device_index Device index - * @param ret_new_tensor Output parameter for the created tensor - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor); - -/** - * Deletes a tensor object and frees associated resources. - * - * For SlimTensor, the underlying storage uses SharedPtr-based reference - * counting. When the last tensor referencing the storage is deleted, - * the memory is automatically freed. 
- * - * @param tensor Pointer to the tensor to delete (must not be null) - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); - -/** - * Creates a new tensor handle that shares storage with the original tensor. - * - * The new handle is a copy of the original tensor's metadata (sizes, strides, - * dtype, device) and shares the same underlying storage via SharedPtr. - * Both tensors will reference the same memory, and the memory will only be - * freed when all references are deleted. - * - * @param orig_handle Pointer to the original tensor (must not be null) - * @param new_handle Output parameter for the new tensor handle - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); - -/** - * Creates a reinterpreted view of a tensor with new sizes, strides, and offset. - * - * This is equivalent to torch.as_strided() - it creates a new tensor that - * shares the same underlying storage but with different view parameters. - * - * @param self Original tensor to reinterpret (must not be null) - * @param ndim Number of dimensions for the new view - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param storage_offset Storage offset in number of elements - * @param ret_new_tensor Output parameter for the reinterpreted tensor view - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - Tensor** ret_new_tensor); - -/** - * Copies data from source tensor to destination tensor. - * - * Handles all device combinations (CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA) - * and supports tensors with different strides. The destination tensor must - * already be allocated with sufficient storage. - * - * @param self Destination tensor (must not be null) - * @param src Source tensor to copy from (must not be null) - * @param non_blocking If true, the copy may be asynchronous (currently ignored) - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); - -/** - * Extracts a boolean scalar value from a single-element tensor. - * - * The tensor must contain exactly one element and have Bool dtype. - * For CUDA tensors, this will synchronize to copy the value to CPU. - * - * @param tensor Single-element boolean tensor (must not be null) - * @param ret_value Output parameter for the extracted boolean value - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_item_bool(Tensor* tensor, bool* ret_value); - -/** - * Moves a tensor into a new handle and assigns it to the output parameter. - * - * Unlike aoti_torch_new_tensor_handle which copies, this function moves the - * source tensor into the destination. After this operation, the source tensor - * is left in an undefined/reset state and should not be used. 
- * - * @param src Source tensor to move from (must not be null, will be reset) - * @param ret_dst Output parameter for the new tensor handle - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst); - -} // extern "C" - -} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt index 204c08688c4..291e3052bbd 100644 --- a/backends/cuda/runtime/shims/tests/CMakeLists.txt +++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt @@ -35,7 +35,7 @@ endif() # Find installed ExecuTorch find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX}) -# List of test files +# List of SlimTensor-based test files (now the primary tests) set(CUDA_SHIM_TESTS test_aoti_torch_create_tensor_from_blob_v2 test_aoti_torch_empty_strided @@ -49,6 +49,7 @@ set(CUDA_SHIM_TESTS enable_testing() +# Build SlimTensor-based tests foreach(test_name ${CUDA_SHIM_TESTS}) add_executable(${test_name} ${test_name}.cpp) @@ -57,16 +58,15 @@ foreach(test_name ${CUDA_SHIM_TESTS}) ${CUDAToolkit_INCLUDE_DIRS} ) + target_compile_definitions(${test_name} PRIVATE CUDA_AVAILABLE=1) + target_link_libraries( ${test_name} PRIVATE GTest::gtest GTest::gtest_main aoti_cuda_shims - aoti_cuda_backend - cuda_tensor_maker - cuda_platform + slimtensor executorch_core - extension_tensor CUDA::cudart ) diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index a6b18eba4c8..04f7aa2f963 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -3,35 +3,12 @@ load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") def cuda_shim_cpp_unittest(name): + """Unittest for SlimTensor-based shim functions.""" cpp_unittest( name = "test_" + name, srcs = [ "test_" + name + ".cpp", ], - deps = [ - "//executorch/backends/aoti:common_shims", - "//executorch/backends/cuda/runtime:runtime_shims", - "//executorch/extension/tensor:tensor", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - "//executorch/runtime/core/exec_aten:lib", - ], - external_deps = [ - ("cuda", None, "cuda-lazy"), - ], - keep_gpu_sections = True, - remote_execution = re_test_utils.remote_execution( - platform = "gpu-remote-execution", - ), - ) - -def cuda_shim_slim_cpp_unittest(name): - """Unittest for SlimTensor-based shim functions.""" - cpp_unittest( - name = "test_" + name + "_slim", - srcs = [ - "test_" + name + "_slim.cpp", - ], deps = [ "//executorch/backends/cuda/runtime:runtime_shims_slim", "//executorch/backends/aoti:common_shims", @@ -58,24 +35,12 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. 
""" - # Original ETensor-based shim tests, will be removed after migration + # SlimTensor-based shim tests (now the primary tests) cuda_shim_cpp_unittest("aoti_torch_empty_strided") - cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") + cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") + cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch__reinterpret_tensor") cuda_shim_cpp_unittest("aoti_torch_copy_") - cuda_shim_cpp_unittest("aoti_torch_cuda_guard") - cuda_shim_cpp_unittest("aoti_torch_cuda__weight_int4pack_mm") - cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") - - # SlimTensor-based shim tests - cuda_shim_slim_cpp_unittest("aoti_torch_empty_strided") - cuda_shim_slim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") - cuda_shim_slim_cpp_unittest("aoti_torch_delete_tensor_object") - cuda_shim_slim_cpp_unittest("aoti_torch_new_tensor_handle") - cuda_shim_slim_cpp_unittest("aoti_torch__reinterpret_tensor") - cuda_shim_slim_cpp_unittest("aoti_torch_copy_") - cuda_shim_slim_cpp_unittest("aoti_torch_item_bool") - cuda_shim_slim_cpp_unittest("aoti_torch_assign_tensors_out") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp index d3044810b15..d2ad645136e 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -7,806 +7,686 @@ */ #include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; -using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; - -// Test fixture for aoti_torch__reinterpret_tensor tests -class AOTITorchReinterpretTensorTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +#include +#include +#include +#include +#include - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +using namespace executorch::backends::cuda; +using executorch::runtime::Error; - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Clear any remaining tensors from previous tests - clear_all_tensors(); - } +namespace { - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear the global tensor storage using the provided function - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; } - - // Helper to calculate number of elements from sizes - int64_t calculate_numel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; - } - return numel; + strides[sizes.size() - 1] = 1; + for (int64_t i = 
static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - // Helper to calculate contiguous strides from sizes - std::vector calculate_contiguous_strides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } +} // namespace - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; +class AOTITorchReinterpretTensorSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a source tensor using empty_strided (which allocates new - // memory) - Tensor* create_source_tensor( + Tensor* createTestTensor( const std::vector& sizes, - int32_t dtype = 6, // float32 - int32_t device_type = 1, // CUDA + const std::vector& strides = {}, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - std::vector strides = calculate_contiguous_strides(sizes); + Tensor* tensor = nullptr; + + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } - Tensor* tensor; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides.data(), + effective_strides.data(), dtype, device_type, device_index, &tensor); - if (error != Error::Ok) { - return nullptr; - } - - return tensor; + return (error == Error::Ok) ? tensor : nullptr; } - - private: - std::vector cuda_memory_buffers_; - std::vector cpu_memory_buffers_; }; -// Test basic functionality: reinterpret tensor with different shapes -TEST_F(AOTITorchReinterpretTensorTest, BasicReinterpretation) { - // Create a source tensor with shape [12] (1D with 12 elements) - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - // Store the original data pointer - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CPU) { + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [3, 4] (2D with same number of elements) - std::vector new_sizes = {3, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {6, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 0; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + storage_offset, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor has the new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 3); - EXPECT_EQ(reinterpreted_tensor->size(1), 4); - - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* 
reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; - - // Write data through the original tensor and verify it's visible through the - // reinterpreted tensor - std::vector test_data = { - 1.0f, - 2.0f, - 3.0f, - 4.0f, - 5.0f, - 6.0f, - 7.0f, - 8.0f, - 9.0f, - 10.0f, - 11.0f, - 12.0f}; - cudaError_t cuda_err = cudaMemcpy( - original_data_ptr, - test_data.data(), - test_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Read back through the reinterpreted tensor - std::vector readback_data(12); - cuda_err = cudaMemcpy( - readback_data.data(), - reinterpreted_data_ptr, - readback_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Verify the data matches - for (size_t i = 0; i < test_data.size(); i++) { - EXPECT_EQ(readback_data[i], test_data[i]) - << "Data should be the same through both tensors at index " << i; - } -} - -// Test reinterpreting with different strides -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretWithCustomStrides) { - // Create a source tensor with shape [2, 6] (contiguous) - std::vector source_sizes = {2, 6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); + ASSERT_NE(view_tensor, nullptr); - // Reinterpret as [3, 4] with custom strides (still valid for the same memory) - std::vector new_sizes = {3, 4}; - std::vector new_strides = {4, 1}; // Row-major strides for [3, 4] + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 6); + EXPECT_EQ(view_tensor->size(1), 4); + EXPECT_EQ(view_tensor->stride(0), 4); + EXPECT_EQ(view_tensor->stride(1), 1); - Tensor* reinterpreted_tensor; - AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - // Check shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 3); - EXPECT_EQ(reinterpreted_tensor->size(1), 4); +TEST_F(AOTITorchReinterpretTensorSlimTest, NullSelf) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + nullptr, sizes.size(), sizes.data(), strides.data(), 0, &view_tensor); - // Verify strides were set correctly - int64_t* tensor_strides; - error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 4); - EXPECT_EQ(tensor_strides[1], 1); + EXPECT_EQ(error, Error::InvalidArgument); } -// Test error cases: null input tensor -TEST_F(AOTITorchReinterpretTensorTest, NullInputTensor) { - std::vector new_sizes = {2, 3}; - std::vector 
new_strides = calculate_contiguous_strides(new_sizes); +TEST_F(AOTITorchReinterpretTensorSlimTest, NullReturnPointer) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {6}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; AOTITorchError error = aoti_torch__reinterpret_tensor( - nullptr, // null input tensor + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + nullptr); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test error cases: null sizes pointer -TEST_F(AOTITorchReinterpretTensorTest, NullSizesPointer) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +TEST_F(AOTITorchReinterpretTensorSlimTest, NegativeNdim) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - std::vector new_strides = {2, 1}; + std::vector new_sizes = {6}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - 2, // ndim > 0 - nullptr, // null sizes pointer - new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + orig_tensor, -1, new_sizes.data(), new_strides.data(), 0, &view_tensor); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test error cases: null return tensor pointer -TEST_F(AOTITorchReinterpretTensorTest, NullReturnTensorPointer) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +// ============================================================================ +// Storage Offset Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CPU) { + std::vector sizes = {4, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {2, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 4; // Skip first row + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - nullptr); // null return tensor pointer + storage_offset, + &view_tensor); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 2); + EXPECT_EQ(view_tensor->size(1), 4); + + char* orig_ptr = static_cast(orig_tensor->data_ptr()); + char* view_ptr = static_cast(view_tensor->data_ptr()); + EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test error cases: non-zero storage offset (should fail) -TEST_F(AOTITorchReinterpretTensorTest, NonZeroStorageOffset) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +// ============================================================================ +// Memory Sharing Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 1, // non-zero storage_offset (should fail) - &reinterpreted_tensor); + 0, + &view_tensor); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reinterpreting CPU tensor -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretCPUTensor) { - // Create a CPU tensor with shape [8] - std::vector source_sizes = {8}; - Tensor* source_tensor = create_source_tensor( - source_sizes, - 6, // float32 - 0, // CPU device +TEST_F(AOTITorchReinterpretTensorSlimTest, MultipleViews_CPU) { + std::vector sizes = {24}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(source_tensor, nullptr); + ASSERT_NE(orig_tensor, nullptr); - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); + void* orig_ptr = orig_tensor->data_ptr(); - // Reinterpret as [2, 4] - std::vector new_sizes = {2, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector sizes1 = {2, 12}; + std::vector strides1 = {12, 1}; + + std::vector sizes2 = {4, 6}; + std::vector strides2 = {6, 1}; + + std::vector sizes3 = {2, 3, 4}; + std::vector strides3 = {12, 4, 1}; + + Tensor* view1 = nullptr; + Tensor* view2 = nullptr; + Tensor* view3 = nullptr; + + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes1.size(), + sizes1.data(), + strides1.data(), + 0, + &view1), + Error::Ok); + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes2.size(), + sizes2.data(), + strides2.data(), + 0, + &view2), + Error::Ok); + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes3.size(), + sizes3.data(), + strides3.data(), + 0, + &view3), + Error::Ok); + + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); + EXPECT_EQ(view3->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); + EXPECT_EQ(view3->data_ptr(), orig_ptr); + + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view3), Error::Ok); +} + +// ============================================================================ +// Dimension Change Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, ExpandDimensions_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_EQ(orig_tensor->dim(), 1); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted CPU tensor should use the same memory as the source tensor"; - - // Test direct memory access for CPU tensors - float* original_float_ptr = reinterpret_cast(original_data_ptr); - float* reinterpreted_float_ptr = - reinterpret_cast(reinterpreted_data_ptr); - - // Write through original and read through reinterpreted - original_float_ptr[0] = 42.0f; - EXPECT_EQ(reinterpreted_float_ptr[0], 42.0f) - << "Changes through original tensor should be visible through reinterpreted tensor"; -} + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 2); -// Test that deleting source tensor doesn't affect reinterpreted tensor (they -// share memory) -TEST_F(AOTITorchReinterpretTensorTest, DeletionBehavior) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - void* shared_data_ptr = source_tensor->mutable_data_ptr(); +TEST_F(AOTITorchReinterpretTensorSlimTest, CollapseDimensions_CPU) { + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_EQ(orig_tensor->dim(), 3); - // Reinterpret as [2, 3] - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {24}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 1); + EXPECT_EQ(view_tensor->numel(), 24); - // Verify they share the same memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), shared_data_ptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - // Delete the source tensor (which owns the memory) - error = aoti_torch_delete_tensor_object(source_tensor); - EXPECT_EQ(error, Error::Ok); +TEST_F(AOTITorchReinterpretTensorSlimTest, ScalarTensorView_CPU) { + std::vector sizes = {1}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {}; + std::vector new_strides = {}; - // The reinterpreted tensor should still be valid but the memory might be - // freed Since the source tensor owned the memory, the reinterpreted tensor - // becomes invalid This is expected behavior - the user needs to manage the - // lifecycle properly + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + orig_tensor, 0, new_sizes.data(), new_strides.data(), 0, &view_tensor); - // Clean up the reinterpreted tensor - error = aoti_torch_delete_tensor_object(reinterpreted_tensor); EXPECT_EQ(error, Error::Ok); -} + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 0); + EXPECT_EQ(view_tensor->numel(), 1); -// Test scalar tensor reinterpretation -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretScalarTensor) { - // Create a scalar tensor (0D) - std::vector source_sizes = {}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - void* original_data_ptr = source_tensor->mutable_data_ptr(); +// ============================================================================ +// Stride Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, TransposeViaStrides_CPU) { + std::vector sizes = {3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Try to reinterpret scalar as [1] (1D with 1 element) - std::vector new_sizes = {1}; - std::vector new_strides = {1}; + std::vector new_sizes = {4, 3}; + std::vector new_strides = {1, 4}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->size(0), 4); + EXPECT_EQ(view_tensor->size(1), 3); + EXPECT_EQ(view_tensor->stride(0), 1); + EXPECT_EQ(view_tensor->stride(1), 4); - // Check new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 1); - EXPECT_EQ(reinterpreted_tensor->size(0), 1); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reinterpreting tensor with zero-sized dimension -// TODO: This test is disabled because zero-sized tensors have complex stride -// validation requirements that need further investigation -TEST_F(AOTITorchReinterpretTensorTest, DISABLED_ReinterpretZeroSizedTensor) { - // 
Create a tensor with shape [0, 5] (zero elements) - std::vector source_sizes = {0, 5}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, Int64Tensor_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [5, 0] (still zero elements) - std::vector new_sizes = {5, 0}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->itemsize(), 8); - // Check new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 5); - EXPECT_EQ(reinterpreted_tensor->size(1), 0); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test with nullptr strides (should use contiguous strides) -TEST_F(AOTITorchReinterpretTensorTest, NullStridesPointer) { - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); +TEST_F(AOTITorchReinterpretTensorSlimTest, BFloat16Tensor_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [3, 4] with null strides (should calculate contiguous - // strides) - std::vector new_sizes = {3, 4}; + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), - nullptr, // null strides - should calculate contiguous strides + new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->itemsize(), 2); - // Check that contiguous strides were calculated correctly - int64_t* tensor_strides; - error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 4); // stride for dimension 0 should be 4 - EXPECT_EQ(tensor_strides[1], 1); // stride for dimension 1 should be 1 + 
EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test bf16 tensor reinterpretation -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretBF16Tensor) { - // Create a bf16 source tensor with shape [6] - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor( - source_sizes, - static_cast( - SupportedDTypes::BFLOAT16), // bf16 dtype from SupportedDTypes - static_cast( - SupportedDevices::CUDA), // CUDA device from SupportedDevices - 0); // device_index must be 0 - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); - - // Verify the tensor is actually bf16 - int32_t actual_dtype = 0; - AOTITorchError dtype_check_error = - aoti_torch_get_dtype(source_tensor, &actual_dtype); - EXPECT_EQ(dtype_check_error, Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Source tensor should have bfloat16 dtype"; - - // Reinterpret as [2, 3] (same number of elements) - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - Tensor* reinterpreted_tensor; + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_TRUE(orig_tensor->is_cuda()); + + std::vector new_sizes = {6, 4}; + std::vector new_strides = {4, 1}; + + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor has the new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 2); - EXPECT_EQ(reinterpreted_tensor->size(1), 3); - - // Verify the dtype is preserved as bf16 - int32_t reinterpreted_dtype = 0; - dtype_check_error = - aoti_torch_get_dtype(reinterpreted_tensor, &reinterpreted_dtype); - EXPECT_EQ(dtype_check_error, Error::Ok); - EXPECT_EQ( - reinterpreted_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Reinterpreted tensor should preserve bfloat16 dtype"; - - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; - - // Test memory sharing by writing data through the original tensor - // and verifying it's visible through the reinterpreted tensor - // Note: bf16 has 2 bytes per element - std::vector test_data_bf16 = { - 0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0}; // bf16 values - cudaError_t cuda_err = cudaMemcpy( - original_data_ptr, - test_data_bf16.data(), - test_data_bf16.size() * sizeof(uint16_t), - cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Read back through the reinterpreted tensor - std::vector 
readback_data_bf16(6); - cuda_err = cudaMemcpy( - readback_data_bf16.data(), - reinterpreted_data_ptr, - readback_data_bf16.size() * sizeof(uint16_t), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Verify the data matches - for (size_t i = 0; i < test_data_bf16.size(); i++) { - EXPECT_EQ(readback_data_bf16[i], test_data_bf16[i]) - << "BF16 data should be the same through both tensors at index " << i; - } + ASSERT_NE(view_tensor, nullptr); + EXPECT_TRUE(view_tensor->is_cuda()); + + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 6); + EXPECT_EQ(view_tensor->size(1), 4); + + EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior - memory not in map should fail -TEST_F(AOTITorchReinterpretTensorTest, MemoryNotInMapShouldFail) { - // Create a tensor directly without using our allocation functions - // This should NOT be in the reference counting map - void* external_memory; - ASSERT_EQ( - cudaMallocManaged(&external_memory, 12 * sizeof(float)), cudaSuccess); - - // Create a tensor by manually wrapping this memory without going through our - // APIs - std::vector sizes = {12}; - std::vector strides = calculate_contiguous_strides(sizes); - - // Create the tensor directly using ExecutorTorch extension - auto tensor_shared = executorch::extension::from_blob( - external_memory, - convert_sizes_to_vector(sizes.size(), sizes.data()), - convert_strides_to_vector(sizes.size(), sizes.data(), strides.data()), - executorch::runtime::etensor::ScalarType::Float); - - ASSERT_TRUE(tensor_shared); - Tensor* external_tensor = tensor_shared.get(); - - // Try to reinterpret this tensor - should fail because memory is not in map - std::vector new_sizes = {3, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); - - Tensor* reinterpreted_tensor; +TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {4, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {2, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 8; + + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - external_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + storage_offset, + &view_tensor); - // Should fail because memory is not being tracked by reference counting - // system - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + EXPECT_TRUE(view_tensor->is_cuda()); + + char* orig_ptr = static_cast(orig_tensor->data_ptr()); + char* view_ptr = static_cast(view_tensor->data_ptr()); + EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - // Clean up the external memory - ASSERT_EQ(cudaFree(external_memory), cudaSuccess); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior - creating view increments reference count -TEST_F(AOTITorchReinterpretTensorTest, ViewCreationIncrementsReferenceCount) { - // Create a source 
tensor that owns memory (reference count = 1) - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - void* shared_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(shared_data_ptr, nullptr); + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Create first view - should increment reference count to 2 - std::vector view1_sizes = {3, 4}; - std::vector view1_strides = - calculate_contiguous_strides(view1_sizes); + void* orig_ptr = orig_tensor->data_ptr(); - Tensor* view1_tensor; - AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - view1_sizes.size(), - view1_sizes.data(), - view1_strides.data(), - 0, - &view1_tensor); + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view1_tensor, nullptr); - EXPECT_EQ(view1_tensor->mutable_data_ptr(), shared_data_ptr); - - // Create second view - should increment reference count to 3 - std::vector view2_sizes = {2, 6}; - std::vector view2_strides = - calculate_contiguous_strides(view2_sizes); - - Tensor* view2_tensor; - error = aoti_torch__reinterpret_tensor( - source_tensor, - view2_sizes.size(), - view2_sizes.data(), - view2_strides.data(), + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + orig_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), 0, - &view2_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view2_tensor, nullptr); - EXPECT_EQ(view2_tensor->mutable_data_ptr(), shared_data_ptr); + ASSERT_NE(view_tensor, nullptr); - // Now delete the source tensor - memory should NOT be freed (reference count - // = 2) - error = aoti_torch_delete_tensor_object(source_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - // Both views should still be valid - test by accessing memory - float test_value = 42.0f; - cudaError_t cuda_err = cudaMemcpy( - shared_data_ptr, &test_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view1_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete first view - memory should still NOT be freed (reference count = 1) - error = aoti_torch_delete_tensor_object(view1_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - // Second view should still be valid - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view2_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete second view - NOW memory should be freed (reference count = 0) - error = aoti_torch_delete_tensor_object(view2_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior with NOT_OWN memory (from blob) - should -// SUCCEED and keep NOT_OWN 
-TEST_F(AOTITorchReinterpretTensorTest, ViewOfNotOwnMemoryKeepsNotOwnStatus) { - // Allocate external memory - void* external_memory; - cudaError_t cuda_err = - cudaMallocManaged(&external_memory, 12 * sizeof(float)); - ASSERT_EQ(cuda_err, cudaSuccess); - - // Create tensor from blob (which marks memory as NOT_OWN) - std::vector blob_sizes = {12}; - std::vector blob_strides = calculate_contiguous_strides(blob_sizes); - - Tensor* blob_tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - external_memory, - blob_sizes.size(), - blob_sizes.data(), - blob_strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device_index - &blob_tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +TEST_F(AOTITorchReinterpretTensorSlimTest, ChainedViews_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {24}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(blob_tensor, nullptr); - - // Create view of NOT_OWN memory - should SUCCEED and keep NOT_OWN status - std::vector view_sizes = {3, 4}; - std::vector view_strides = calculate_contiguous_strides(view_sizes); - - Tensor* view_tensor; - error = aoti_torch__reinterpret_tensor( - blob_tensor, - view_sizes.size(), - view_sizes.data(), - view_strides.data(), - 0, - &view_tensor); + void* orig_ptr = orig_tensor->data_ptr(); - // Should succeed - NOT_OWN memory can be reinterpreted but stays NOT_OWN - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->mutable_data_ptr(), external_memory); - - // Verify both tensors share the same memory - EXPECT_EQ(blob_tensor->mutable_data_ptr(), view_tensor->mutable_data_ptr()); - - // Test memory sharing by writing data through one tensor and reading through - // the other - float test_value = 42.0f; - cuda_err = cudaMemcpy( - external_memory, &test_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete the blob tensor - external memory should NOT be freed (NOT_OWN - // behavior) - error = aoti_torch_delete_tensor_object(blob_tensor); - EXPECT_EQ(error, Error::Ok); + std::vector sizes1 = {4, 6}; + std::vector strides1 = {6, 1}; - // View tensor should still be valid - test by accessing memory - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete view tensor - external memory should still NOT be freed (NOT_OWN - // behavior) - error = aoti_torch_delete_tensor_object(view_tensor); - EXPECT_EQ(error, Error::Ok); + Tensor* view1 = nullptr; + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes1.size(), + sizes1.data(), + strides1.data(), + 0, + &view1), + Error::Ok); + + std::vector sizes2 = {2, 2, 6}; + std::vector strides2 = {12, 6, 1}; + + Tensor* view2 = nullptr; + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + view1, sizes2.size(), sizes2.data(), strides2.data(), 0, &view2), + 
Error::Ok); - // External memory should still be accessible (proves neither tensor freed it) - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, external_memory, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); - // Clean up external memory manually (as expected for NOT_OWN memory) - ASSERT_EQ(cudaFree(external_memory), cudaSuccess); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp deleted file mode 100644 index d2ad645136e..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp +++ /dev/null @@ -1,692 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchReinterpretTensorSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 0; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 6); - EXPECT_EQ(view_tensor->size(1), 4); - EXPECT_EQ(view_tensor->stride(0), 4); - EXPECT_EQ(view_tensor->stride(1), 1); - - EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NullSelf) { - std::vector sizes = {2, 3}; - std::vector strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - nullptr, sizes.size(), sizes.data(), strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NullReturnPointer) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6}; - std::vector new_strides = {1}; - - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NegativeNdim) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6}; - std::vector new_strides = {1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, -1, new_sizes.data(), new_strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// ============================================================================ -// Storage Offset Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CPU) { - std::vector sizes = {4, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 4; // Skip first row - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - 
orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 2); - EXPECT_EQ(view_tensor->size(1), 4); - - char* orig_ptr = static_cast(orig_tensor->data_ptr()); - char* view_ptr = static_cast(view_tensor->data_ptr()); - EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// Memory Sharing Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, MultipleViews_CPU) { - std::vector sizes = {24}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector sizes1 = {2, 12}; - std::vector strides1 = {12, 1}; - - std::vector sizes2 = {4, 6}; - std::vector strides2 = {6, 1}; - - std::vector sizes3 = {2, 3, 4}; - std::vector strides3 = {12, 4, 1}; - - Tensor* view1 = nullptr; - Tensor* view2 = nullptr; - Tensor* view3 = nullptr; - - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes1.size(), - sizes1.data(), - strides1.data(), - 0, - &view1), - Error::Ok); - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes2.size(), - sizes2.data(), - strides2.data(), - 0, - &view2), - Error::Ok); - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes3.size(), - sizes3.data(), - strides3.data(), - 0, - &view3), - Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - EXPECT_EQ(view3->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - EXPECT_EQ(view3->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view3), Error::Ok); -} - -// ============================================================================ -// Dimension Change Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, ExpandDimensions_CPU) { - 
std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->dim(), 1); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, CollapseDimensions_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->dim(), 3); - - std::vector new_sizes = {24}; - std::vector new_strides = {1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 1); - EXPECT_EQ(view_tensor->numel(), 24); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, ScalarTensorView_CPU) { - std::vector sizes = {1}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {}; - std::vector new_strides = {}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, 0, new_sizes.data(), new_strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 0); - EXPECT_EQ(view_tensor->numel(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// Stride Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, TransposeViaStrides_CPU) { - std::vector sizes = {3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {4, 3}; - std::vector new_strides = {1, 4}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->size(0), 4); - EXPECT_EQ(view_tensor->size(1), 3); - EXPECT_EQ(view_tensor->stride(0), 1); - EXPECT_EQ(view_tensor->stride(1), 4); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// 
============================================================================ -// Different Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, Int64Tensor_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->itemsize(), 8); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, BFloat16Tensor_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->itemsize(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_TRUE(orig_tensor->is_cuda()); - - std::vector new_sizes = {6, 4}; - std::vector new_strides = {4, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_TRUE(view_tensor->is_cuda()); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 6); - EXPECT_EQ(view_tensor->size(1), 4); - - EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {4, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 8; - - Tensor* view_tensor = nullptr; - 
AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_TRUE(view_tensor->is_cuda()); - - char* orig_ptr = static_cast(orig_tensor->data_ptr()); - char* view_ptr = static_cast(view_tensor->data_ptr()); - EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, ChainedViews_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {24}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector sizes1 = {4, 6}; - std::vector strides1 = {6, 1}; - - Tensor* view1 = nullptr; - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes1.size(), - sizes1.data(), - strides1.data(), - 0, - &view1), - Error::Ok); - - std::vector sizes2 = {2, 2, 6}; - std::vector strides2 = {12, 6, 1}; - - Tensor* view2 = nullptr; - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - view1, sizes2.size(), sizes2.data(), strides2.data(), 0, &view2), - Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp index d5e1bcb2547..f01743745d2 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp @@ -7,239 +7,431 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_assign_tensors_out tests -class 
AOTITorchAssignTensorsOutTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchAssignTensorsOutSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a test tensor - Tensor* create_test_tensor( + Tensor* createTestTensor( const std::vector& sizes, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA)) { - std::vector strides; - // Calculate contiguous strides - if (!sizes.empty()) { - strides.resize(sizes.size()); - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } + const std::vector& strides = {}, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); } - Tensor* tensor; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); - AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, - 0, + device_index, &tensor); return (error == Error::Ok) ? 
tensor : nullptr; } }; -// Test basic functionality -TEST_F(AOTITorchAssignTensorsOutTest, BasicFunctionality) { - // Create a source tensor - std::vector sizes = {2, 3}; - Tensor* src = create_test_tensor(sizes); - ASSERT_NE(src, nullptr); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ - // Create output tensor handle - Tensor* dst = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst); +TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CPU) { + std::vector sizes = {2, 3}; + Tensor* src_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src_tensor, nullptr); + + // Store expected properties before move + int64_t expected_dim = src_tensor->dim(); + int64_t expected_size0 = src_tensor->size(0); + int64_t expected_size1 = src_tensor->size(1); + size_t expected_numel = src_tensor->numel(); + void* expected_data_ptr = src_tensor->data_ptr(); + + Tensor* dst_tensor = nullptr; + AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(dst, nullptr); + ASSERT_NE(dst_tensor, nullptr); + + // Verify destination tensor has the moved properties + EXPECT_EQ(dst_tensor->dim(), expected_dim); + EXPECT_EQ(dst_tensor->size(0), expected_size0); + EXPECT_EQ(dst_tensor->size(1), expected_size1); + EXPECT_EQ(dst_tensor->numel(), expected_numel); + EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); + + // Source tensor is now in undefined state after move - just delete it + // (accessing src_tensor properties is undefined behavior after move) + delete src_tensor; // Direct delete since it's in undefined state + EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); +} + +TEST_F(AOTITorchAssignTensorsOutSlimTest, NullSrc) { + Tensor* dst_tensor = nullptr; + AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst_tensor); - // Verify the output tensor has the same properties as source - EXPECT_EQ(dst->dim(), src->dim()); - EXPECT_EQ(dst->size(0), src->size(0)); - EXPECT_EQ(dst->size(1), src->size(1)); - EXPECT_EQ(dst->numel(), src->numel()); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchAssignTensorsOutSlimTest, NullDst) { + std::vector sizes = {2, 3}; + Tensor* src_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src_tensor, nullptr); + + AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, nullptr); - // Verify they share the same memory - EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr()); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); } -// Test with 1D tensor -TEST_F(AOTITorchAssignTensorsOutTest, OneDimensionalTensor) { - std::vector sizes = {10}; - Tensor* src = create_test_tensor(sizes); - ASSERT_NE(src, nullptr); +// ============================================================================ +// Move Semantics Tests +// ============================================================================ - Tensor* dst = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst); +TEST_F(AOTITorchAssignTensorsOutSlimTest, SourceBecomesUndefinedAfterMove_CPU) { + std::vector sizes = {3, 4}; + Tensor* src_tensor = createTestTensor( + sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 1);
-  EXPECT_EQ(dst->size(0), 10);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
-}
+  ASSERT_NE(dst_tensor, nullptr);

-// Test with 3D tensor
-TEST_F(AOTITorchAssignTensorsOutTest, ThreeDimensionalTensor) {
-  std::vector<int64_t> sizes = {2, 3, 4};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is now in undefined state - verify it's no longer defined
+  EXPECT_FALSE(src_tensor->defined());

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  // Clean up - delete in this order since src is undefined
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+// ============================================================================
+// Tensor Property Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, CustomStrides_CPU) {
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {4, 1};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      strides,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  // Store expected strides before move
+  int64_t expected_stride0 = src_tensor->stride(0);
+  int64_t expected_stride1 = src_tensor->stride(1);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 3);
-  EXPECT_EQ(dst->size(0), 2);
-  EXPECT_EQ(dst->size(1), 3);
-  EXPECT_EQ(dst->size(2), 4);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Verify destination has the expected strides
+  EXPECT_EQ(dst_tensor->stride(0), expected_stride0);
+  EXPECT_EQ(dst_tensor->stride(1), expected_stride1);
+
+  delete src_tensor; // Source is undefined after move
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with scalar (0D) tensor
-TEST_F(AOTITorchAssignTensorsOutTest, ScalarTensor) {
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ScalarTensor_CPU) {
   std::vector<int64_t> sizes = {};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_EQ(src_tensor->dim(), 0);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  EXPECT_EQ(dst_tensor->dim(), 0);
+  EXPECT_EQ(dst_tensor->numel(), 1);

-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 0);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with null source pointer
-TEST_F(AOTITorchAssignTensorsOutTest, NullSourcePointer) {
-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst);
-  EXPECT_EQ(error, Error::InvalidArgument);
+TEST_F(AOTITorchAssignTensorsOutSlimTest, LargeMultiDimensionalTensor_CPU) {
+  std::vector<int64_t> sizes = {10, 20, 30};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->dim(), 3);
+  EXPECT_EQ(dst_tensor->size(0), 10);
+  EXPECT_EQ(dst_tensor->size(1), 20);
+  EXPECT_EQ(dst_tensor->size(2), 30);
+  EXPECT_EQ(dst_tensor->numel(), 6000);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with null destination pointer
-TEST_F(AOTITorchAssignTensorsOutTest, NullDestinationPointer) {
+// ============================================================================
+// Different Dtype Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, Int64Tensor_CPU) {
   std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Long),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  AOTITorchError error = aoti_torch_assign_tensors_out(src, nullptr);
-  EXPECT_EQ(error, Error::InvalidArgument);
+  EXPECT_EQ(dst_tensor->itemsize(), 8);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test that strides are preserved
-TEST_F(AOTITorchAssignTensorsOutTest, StridesPreserved) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BFloat16Tensor_CPU) {
+  std::vector<int64_t> sizes = {2, 3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::BFloat16),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  EXPECT_EQ(dst_tensor->itemsize(), 2);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BoolTensor_CPU) {
+  std::vector<int64_t> sizes = {4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Bool),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
+  ASSERT_NE(dst_tensor, nullptr);

-  // Get strides from both tensors
-  int64_t* src_strides;
-  int64_t* dst_strides;
-  aoti_torch_get_strides(src, &src_strides);
-  aoti_torch_get_strides(dst, &dst_strides);
+  EXPECT_EQ(dst_tensor->itemsize(), 1);

-  // Verify strides match
-  for (int64_t i = 0; i < src->dim(); i++) {
-    EXPECT_EQ(src_strides[i], dst_strides[i]);
-  }
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with CPU tensor
-TEST_F(AOTITorchAssignTensorsOutTest, CPUTensor) {
+// ============================================================================
+// CUDA Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
   std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(
+  Tensor* src_tensor = createTestTensor(
       sizes,
-      static_cast<int32_t>(SupportedDTypes::FLOAT32),
-      static_cast<int32_t>(SupportedDevices::CPU));
-  ASSERT_NE(src, nullptr);
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_TRUE(src_tensor->is_cuda());

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  // Store expected properties before move
+  void* expected_data_ptr = src_tensor->data_ptr();
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+  EXPECT_TRUE(dst_tensor->is_cuda());
+  EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr);
+
+  // Source is undefined after move
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(
+    AOTITorchAssignTensorsOutSlimTest,
+    SourceBecamesUndefinedAfterMove_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is now in undefined state
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test dtype is preserved
-TEST_F(AOTITorchAssignTensorsOutTest, DtypePreserved) {
-  // Test with different dtypes
-  std::vector<int32_t> dtypes = {
-      static_cast<int32_t>(SupportedDTypes::FLOAT32),
-      static_cast<int32_t>(SupportedDTypes::INT32),
-      static_cast<int32_t>(SupportedDTypes::INT64),
-  };
-
-  for (int32_t dtype : dtypes) {
-    cleanup_tensor_metadata();
-    clear_all_tensors();
-
-    std::vector<int64_t> sizes = {2, 3};
-    Tensor* src = create_test_tensor(sizes, dtype);
-    ASSERT_NE(src, nullptr);
-
-    Tensor* dst = nullptr;
-    AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
-
-    EXPECT_EQ(error, Error::Ok);
-    EXPECT_NE(dst, nullptr);
-
-    // Verify dtype is preserved
-    int32_t src_dtype, dst_dtype;
-    aoti_torch_get_dtype(src, &src_dtype);
-    aoti_torch_get_dtype(dst, &dst_dtype);
-    EXPECT_EQ(src_dtype, dst_dtype)
-        << "Dtype mismatch for dtype code: " << dtype;
+// ============================================================================
+// Mixed Device Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, MixedDeviceAssignments) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
   }
+
+  std::vector<int64_t> sizes = {2, 3};
+
+  Tensor* cpu_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(cpu_src, nullptr);
+  EXPECT_TRUE(cpu_src->is_cpu());
+
+  Tensor* cuda_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(cuda_src, nullptr);
+  EXPECT_TRUE(cuda_src->is_cuda());
+
+  Tensor* cpu_dst = nullptr;
+  Tensor* cuda_dst = nullptr;
+
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cpu_src, &cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cuda_src, &cuda_dst), Error::Ok);
+
+  EXPECT_TRUE(cpu_dst->is_cpu());
+  EXPECT_TRUE(cuda_dst->is_cuda());
+  EXPECT_NE(cpu_dst->data_ptr(), cuda_dst->data_ptr());
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_dst), Error::Ok);
 }
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
deleted file mode 100644
index f01743745d2..00000000000
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchAssignTensorsOutSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CPU) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - // Store expected properties before move - int64_t expected_dim = src_tensor->dim(); - int64_t expected_size0 = src_tensor->size(0); - int64_t expected_size1 = src_tensor->size(1); - size_t expected_numel = src_tensor->numel(); - void* expected_data_ptr = src_tensor->data_ptr(); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Verify destination tensor has the moved properties - EXPECT_EQ(dst_tensor->dim(), expected_dim); - EXPECT_EQ(dst_tensor->size(0), expected_size0); - EXPECT_EQ(dst_tensor->size(1), expected_size1); - EXPECT_EQ(dst_tensor->numel(), expected_numel); - EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); - - // Source tensor is now in undefined state after move - just delete it - // (accessing src_tensor properties is undefined behavior after move) - delete src_tensor; // Direct delete since it's in undefined state - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, NullSrc) { - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, NullDst) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - AOTITorchError 
error = aoti_torch_assign_tensors_out(src_tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); -} - -// ============================================================================ -// Move Semantics Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, SourceBecamesUndefinedAfterMove_CPU) { - std::vector sizes = {3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - void* original_ptr = src_tensor->data_ptr(); - ASSERT_NE(original_ptr, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Destination has the original pointer - EXPECT_EQ(dst_tensor->data_ptr(), original_ptr); - - // Source tensor is now in undefined state - verify it's no longer defined - EXPECT_FALSE(src_tensor->defined()); - - // Clean up - delete in this order since src is undefined - delete src_tensor; - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Tensor Property Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, CustomStrides_CPU) { - std::vector sizes = {3, 4}; - std::vector strides = {4, 1}; - Tensor* src_tensor = createTestTensor( - sizes, - strides, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - // Store expected strides before move - int64_t expected_stride0 = src_tensor->stride(0); - int64_t expected_stride1 = src_tensor->stride(1); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Verify destination has the expected strides - EXPECT_EQ(dst_tensor->stride(0), expected_stride0); - EXPECT_EQ(dst_tensor->stride(1), expected_stride1); - - delete src_tensor; // Source is undefined after move - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, ScalarTensor_CPU) { - std::vector sizes = {}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - EXPECT_EQ(src_tensor->dim(), 0); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->dim(), 0); - EXPECT_EQ(dst_tensor->numel(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, LargeMultiDimensionalTensor_CPU) { - std::vector sizes = {10, 20, 30}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, 
Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->dim(), 3); - EXPECT_EQ(dst_tensor->size(0), 10); - EXPECT_EQ(dst_tensor->size(1), 20); - EXPECT_EQ(dst_tensor->size(2), 30); - EXPECT_EQ(dst_tensor->numel(), 6000); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Different Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, Int64Tensor_CPU) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 8); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BFloat16Tensor_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BoolTensor_CPU) { - std::vector sizes = {4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src_tensor, nullptr); - EXPECT_TRUE(src_tensor->is_cuda()); - - // Store expected properties before move - void* expected_data_ptr = src_tensor->data_ptr(); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - EXPECT_TRUE(dst_tensor->is_cuda()); - EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); - - // Source is undefined after move - EXPECT_FALSE(src_tensor->defined()); - - delete src_tensor; - 
EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F( - AOTITorchAssignTensorsOutSlimTest, - SourceBecamesUndefinedAfterMove_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src_tensor, nullptr); - - void* original_ptr = src_tensor->data_ptr(); - ASSERT_NE(original_ptr, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Destination has the original pointer - EXPECT_EQ(dst_tensor->data_ptr(), original_ptr); - - // Source tensor is now in undefined state - EXPECT_FALSE(src_tensor->defined()); - - delete src_tensor; - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Mixed Device Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, MixedDeviceAssignments) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - Tensor* cpu_src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(cpu_src, nullptr); - EXPECT_TRUE(cpu_src->is_cpu()); - - Tensor* cuda_src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(cuda_src, nullptr); - EXPECT_TRUE(cuda_src->is_cuda()); - - Tensor* cpu_dst = nullptr; - Tensor* cuda_dst = nullptr; - - EXPECT_EQ(aoti_torch_assign_tensors_out(cpu_src, &cpu_dst), Error::Ok); - EXPECT_EQ(aoti_torch_assign_tensors_out(cuda_src, &cuda_dst), Error::Ok); - - EXPECT_TRUE(cpu_dst->is_cpu()); - EXPECT_TRUE(cuda_dst->is_cuda()); - EXPECT_NE(cpu_dst->data_ptr(), cuda_dst->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_dst), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_dst), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp index 9fca0f92cf8..c2e67732b41 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp @@ -7,392 +7,481 @@ */ #include -#include -#include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::backends::aoti; -using namespace executorch::runtime; +using executorch::runtime::Error; -// Test fixture for aoti_torch_copy_ tests -class AOTITorchCopyTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any 
existing cached metadata before each test - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchCopySlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create test tensors with specific data - Tensor* create_test_tensor_with_data( + Tensor* createTestTensor( const std::vector& sizes, - const std::vector& data, const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - Tensor* tensor; + Tensor* tensor = nullptr; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, &tensor); - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } + return (error == Error::Ok) ? 
tensor : nullptr; + } +}; - // Fill tensor with data - size_t total_bytes = data.size() * sizeof(float); - if (device_type == static_cast(SupportedDevices::CUDA)) { - cudaError_t memcpy_err = cudaMemcpy( - tensor->mutable_data_ptr(), - data.data(), - total_bytes, - cudaMemcpyHostToDevice); - // Note: Error is checked but we don't fail the function - // This allows tests to proceed and handle errors as needed - (void)memcpy_err; // Suppress unused variable warning - } else { // CPU - std::memcpy(tensor->mutable_data_ptr(), data.data(), total_bytes); - } +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ - return tensor; +TEST_F(AOTITorchCopySlimTest, BasicCopy_CPU) { + std::vector sizes = {3, 4}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i + 1); } - // Helper to get data from tensor - std::vector get_tensor_data(Tensor* tensor) { - if (!tensor) { - return {}; - } + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - size_t num_elements = tensor->numel(); - std::vector data(num_elements); - - // Determine if this is a CUDA tensor - cudaPointerAttributes attributes{}; - cudaError_t err = cudaPointerGetAttributes(&attributes, tensor->data_ptr()); - bool is_device = - (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice); - - if (is_device) { - cudaError_t memcpy_err = cudaMemcpy( - data.data(), - tensor->data_ptr(), - num_elements * sizeof(float), - cudaMemcpyDeviceToHost); - // Note: Error is checked but we don't fail the function - // This allows tests to proceed and handle errors as needed - (void)memcpy_err; // Suppress unused variable warning - } else { - std::memcpy( - data.data(), tensor->data_ptr(), num_elements * sizeof(float)); - } + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); - return data; + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_FLOAT_EQ(dst_data[i], static_cast(i + 1)); } - // Helper to verify two tensors have same data - bool tensors_equal(Tensor* a, Tensor* b, float tolerance = 1e-6f) { - if (!a || !b) { - return false; - } - if (a->numel() != b->numel()) { - return false; - } + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - auto data_a = get_tensor_data(a); - auto data_b = get_tensor_data(b); +TEST_F(AOTITorchCopySlimTest, NullSelf) { + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); - for (size_t i = 0; i < data_a.size(); ++i) { - if (std::abs(data_a[i] - data_b[i]) > tolerance) { - return false; - } - } - return true; - } -}; + AOTITorchError error = aoti_torch_copy_(nullptr, src, 0); + EXPECT_EQ(error, Error::InvalidArgument); -// Test basic copy functionality - same schema (fast path) -TEST_F(AOTITorchCopyTest, BasicCopySameSchema) { - // Create source tensor with test data + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); 
+} + +TEST_F(AOTITorchCopySlimTest, NullSrc) { std::vector sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - Tensor* src = create_test_tensor_with_data(sizes, src_data); - EXPECT_NE(src, nullptr); + AOTITorchError error = aoti_torch_copy_(dst, nullptr, 0); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} + +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchCopySlimTest, Int64Copy_CPU) { + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + int64_t* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = i * 100; + } - // Create destination tensor with same schema - Tensor* dst = - create_test_tensor_with_data(sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); - EXPECT_NE(dst, nullptr); + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Perform copy AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy was successful - EXPECT_TRUE(tensors_equal(dst, src)); -} - -// Test copy with different strides (pointwise fallback) -TEST_F(AOTITorchCopyTest, CopyDifferentStrides) { - // Create source tensor (2x3) with contiguous layout - std::vector src_sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_EQ(dst_data[i], i * 100); + } - Tensor* src = create_test_tensor_with_data(src_sizes, src_data); - EXPECT_NE(src, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - // Create destination tensor with transposed strides - std::vector dst_strides = {1, 2}; // Column-major layout - Tensor* dst = create_test_tensor_with_data( - src_sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, dst_strides); - EXPECT_NE(dst, nullptr); +TEST_F(AOTITorchCopySlimTest, BoolCopy_CPU) { + std::vector sizes = {4}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + bool* src_data = static_cast(src->data_ptr()); + src_data[0] = true; + src_data[1] = false; + src_data[2] = true; + src_data[3] = false; + + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Perform copy - this should use pointwise fallback AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify the copy worked correctly by checking specific elements - auto dst_data = get_tensor_data(dst); - auto src_data_check = get_tensor_data(src); + bool* dst_data = static_cast(dst->data_ptr()); + EXPECT_EQ(dst_data[0], true); + EXPECT_EQ(dst_data[1], false); + EXPECT_EQ(dst_data[2], true); + EXPECT_EQ(dst_data[3], false); - // For 
transposed layout, the data should be rearranged - EXPECT_EQ(dst_data.size(), 6); - EXPECT_EQ(src_data_check.size(), 6); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy between CPU and CUDA tensors -TEST_F(AOTITorchCopyTest, CopyCPUToCUDA) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; +// ============================================================================ +// Tensor Shape Tests +// ============================================================================ - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, ScalarTensorCopy_CPU) { + std::vector sizes = {}; + Tensor* src = createTestTensor( sizes, - data, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU)); // CPU - EXPECT_NE(cpu_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + EXPECT_EQ(src->dim(), 0); + EXPECT_EQ(src->numel(), 1); + + float* src_data = static_cast(src->data_ptr()); + *src_data = 42.0f; - // Create CUDA tensor - Tensor* cuda_tensor = create_test_tensor_with_data( + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); // CUDA - EXPECT_NE(cuda_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Copy from CPU to CUDA - AOTITorchError error = aoti_torch_copy_(cuda_tensor, cpu_tensor, 0); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy - EXPECT_TRUE(tensors_equal(cuda_tensor, cpu_tensor)); -} + float* dst_data = static_cast(dst->data_ptr()); + EXPECT_FLOAT_EQ(*dst_data, 42.0f); -// Test copy between CUDA and CPU tensors -TEST_F(AOTITorchCopyTest, CopyCUDAToCPU) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - // Create CUDA tensor - Tensor* cuda_tensor = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, LargeTensorCopy_CPU) { + std::vector sizes = {100, 100}; + Tensor* src = createTestTensor( sizes, - data, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); // CUDA - EXPECT_NE(cuda_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i); + } - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor_with_data( + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU)); // CPU - EXPECT_NE(cpu_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Copy from CUDA to CPU - AOTITorchError error = aoti_torch_copy_(cpu_tensor, cuda_tensor, 0); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy - EXPECT_TRUE(tensors_equal(cpu_tensor, cuda_tensor)); + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + 
EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy with bf16 dtype support -TEST_F(AOTITorchCopyTest, CopyBf16Tensors) { - // Test that bf16 tensors can be created and copied - std::vector sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; +// ============================================================================ +// CUDA Tests +// ============================================================================ - // Note: We create float32 data but the tensor will be created with bf16 dtype - // This simulates creating bf16 tensors - Tensor* src = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, CudaToCuda) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {3, 4}; + + std::vector host_src_data(12); + for (size_t i = 0; i < host_src_data.size(); i++) { + host_src_data[i] = static_cast(i + 1); + } + + Tensor* src = createTestTensor( sizes, - src_data, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(src, nullptr); - - // Create destination tensor with bf16 dtype - std::vector dst_init(6, 0.0f); - Tensor* dst = create_test_tensor_with_data( + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(src, nullptr); + EXPECT_TRUE(src->is_cuda()); + + cudaMemcpy( + src->data_ptr(), + host_src_data.data(), + host_src_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + + Tensor* dst = createTestTensor( sizes, - dst_init, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(dst, nullptr); - - // Perform copy between bf16 tensors + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cuda()); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify that both tensors have the expected dtype - int32_t src_dtype, dst_dtype; - aoti_torch_get_dtype(src, &src_dtype); - aoti_torch_get_dtype(dst, &dst_dtype); + std::vector host_dst_data(12); + cudaMemcpy( + host_dst_data.data(), + dst->data_ptr(), + host_dst_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); - EXPECT_EQ(src_dtype, static_cast(SupportedDTypes::BFLOAT16)); - EXPECT_EQ(dst_dtype, static_cast(SupportedDTypes::BFLOAT16)); + for (size_t i = 0; i < host_dst_data.size(); i++) { + EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i + 1)); + } - // Verify copy was successful by checking numel matches - EXPECT_EQ(src->numel(), dst->numel()); - EXPECT_EQ(src->numel(), 6); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy between different dtypes should fail -TEST_F(AOTITorchCopyTest, CopyDTypeMismatchError) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; +TEST_F(AOTITorchCopySlimTest, CpuToCuda) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Create float32 tensor - Tensor* float32_tensor = create_test_tensor_with_data( + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( sizes, - data, - {}, // default strides - 
static_cast(SupportedDTypes::FLOAT32), // float32 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(float32_tensor, nullptr); - - // Create bf16 tensor - Tensor* bf16_tensor = create_test_tensor_with_data( + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + EXPECT_TRUE(src->is_cpu()); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i * 10); + } + + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(bf16_tensor, nullptr); - - // Attempting to copy between different dtypes should fail - AOTITorchError error = aoti_torch_copy_(bf16_tensor, float32_tensor, 0); - EXPECT_EQ(error, Error::InvalidArgument); + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cuda()); - // Reverse direction should also fail - error = aoti_torch_copy_(float32_tensor, bf16_tensor, 0); - EXPECT_EQ(error, Error::InvalidArgument); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + std::vector host_dst_data(6); + cudaMemcpy( + host_dst_data.data(), + dst->data_ptr(), + host_dst_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < host_dst_data.size(); i++) { + EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i * 10)); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test error conditions -TEST_F(AOTITorchCopyTest, ErrorHandling) { +TEST_F(AOTITorchCopySlimTest, CudaToCpu) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {2, 3}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - Tensor* valid_tensor = create_test_tensor_with_data(sizes, data); - EXPECT_NE(valid_tensor, nullptr); + std::vector host_src_data(6); + for (size_t i = 0; i < host_src_data.size(); i++) { + host_src_data[i] = static_cast(i * 5); + } - // Test null pointers - AOTITorchError error = aoti_torch_copy_(nullptr, valid_tensor, 0); - EXPECT_NE(error, Error::Ok); + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(src, nullptr); + + cudaMemcpy( + src->data_ptr(), + host_src_data.data(), + host_src_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cpu()); - error = aoti_torch_copy_(valid_tensor, nullptr, 0); - EXPECT_NE(error, Error::Ok); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); - // Test numel mismatch (different total number of elements) - std::vector different_numel_sizes = { - 2, 3, 4}; // 24 elements vs 6 elements - std::vector different_data(24, 1.0f); - Tensor* different_numel = - create_test_tensor_with_data(different_numel_sizes, different_data); - EXPECT_NE(different_numel, nullptr); + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_FLOAT_EQ(dst_data[i], 
static_cast<float>(i * 5));
+  }

-  error = aoti_torch_copy_(valid_tensor, different_numel, 0);
-  EXPECT_EQ(error, Error::InvalidArgument);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok);
 }

-// Test copy from 1D to 3D with same total elements
-TEST_F(AOTITorchCopyTest, Copy1DTo3DSameNumel) {
-  // Source tensor: 8 elements in 1D
-  std::vector<int64_t> src_sizes = {8};
-  std::vector<float> src_data = {
-      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+// ============================================================================
+// Non-blocking Tests
+// ============================================================================

-  Tensor* src = create_test_tensor_with_data(src_sizes, src_data);
-  EXPECT_NE(src, nullptr);
+TEST_F(AOTITorchCopySlimTest, NonBlockingFlag_CPU) {
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src, nullptr);
+
+  float* src_data = static_cast<float*>(src->data_ptr());
+  for (int64_t i = 0; i < src->numel(); i++) {
+    src_data[i] = static_cast<float>(i);
   }

-  // Destination tensor: 2x2x2 = 8 elements (different shape, same total)
-  std::vector<int64_t> dst_sizes = {2, 2, 2};
-  std::vector<float> dst_init(8, 0.0f);
-  Tensor* dst = create_test_tensor_with_data(dst_sizes, dst_init);
-  EXPECT_NE(dst, nullptr);
+  Tensor* dst = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(dst, nullptr);

-  // This should work - same total number of elements
-  AOTITorchError error = aoti_torch_copy_(dst, src, 0);
+  AOTITorchError error = aoti_torch_copy_(dst, src, 1);
   EXPECT_EQ(error, Error::Ok);

-  // Verify the data was copied correctly
-  auto dst_data = get_tensor_data(dst);
-  EXPECT_EQ(dst_data.size(), 8);
+  float* dst_data = static_cast<float*>(dst->data_ptr());
+  for (int64_t i = 0; i < dst->numel(); i++) {
+    EXPECT_FLOAT_EQ(dst_data[i], static_cast<float>(i));
   }

-  // Check some specific elements to verify correct copying
-  EXPECT_FLOAT_EQ(dst_data[0], 1.0f);
-  EXPECT_FLOAT_EQ(dst_data[7], 8.0f);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok);
 }
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp
deleted file mode 100644
index c2e67732b41..00000000000
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchCopySlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, BasicCopy_CPU) { - std::vector sizes = {3, 4}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i + 1); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i + 1)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, NullSelf) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - AOTITorchError error = aoti_torch_copy_(nullptr, src, 0); - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, NullSrc) { - std::vector sizes = {2, 3}; - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, nullptr, 0); - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Different 
Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, Int64Copy_CPU) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - int64_t* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = i * 100; - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - int64_t* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_EQ(dst_data[i], i * 100); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, BoolCopy_CPU) { - std::vector sizes = {4}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - bool* src_data = static_cast(src->data_ptr()); - src_data[0] = true; - src_data[1] = false; - src_data[2] = true; - src_data[3] = false; - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - bool* dst_data = static_cast(dst->data_ptr()); - EXPECT_EQ(dst_data[0], true); - EXPECT_EQ(dst_data[1], false); - EXPECT_EQ(dst_data[2], true); - EXPECT_EQ(dst_data[3], false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Tensor Shape Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, ScalarTensorCopy_CPU) { - std::vector sizes = {}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - EXPECT_EQ(src->dim(), 0); - EXPECT_EQ(src->numel(), 1); - - float* src_data = static_cast(src->data_ptr()); - *src_data = 42.0f; - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - EXPECT_FLOAT_EQ(*dst_data, 42.0f); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, LargeTensorCopy_CPU) { - std::vector sizes = {100, 100}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, 
nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, CudaToCuda) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {3, 4}; - - std::vector host_src_data(12); - for (size_t i = 0; i < host_src_data.size(); i++) { - host_src_data[i] = static_cast(i + 1); - } - - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src, nullptr); - EXPECT_TRUE(src->is_cuda()); - - cudaMemcpy( - src->data_ptr(), - host_src_data.data(), - host_src_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cuda()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - std::vector host_dst_data(12); - cudaMemcpy( - host_dst_data.data(), - dst->data_ptr(), - host_dst_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - - for (size_t i = 0; i < host_dst_data.size(); i++) { - EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i + 1)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, CpuToCuda) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - EXPECT_TRUE(src->is_cpu()); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i * 10); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cuda()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - std::vector host_dst_data(6); - cudaMemcpy( - host_dst_data.data(), - dst->data_ptr(), - host_dst_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - - for (size_t i = 0; i < host_dst_data.size(); i++) { - EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i * 10)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, CudaToCpu) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - std::vector host_src_data(6); - for (size_t i = 0; i < host_src_data.size(); i++) { - host_src_data[i] = static_cast(i * 5); - } - - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src, nullptr); - - cudaMemcpy( - src->data_ptr(), - 
host_src_data.data(), - host_src_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cpu()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i * 5)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Non-blocking Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, NonBlockingFlag_CPU) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 1); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp index db0ab84970d..21f8c79cc46 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp @@ -7,380 +7,271 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_create_tensor_from_blob_v2 tests -class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { + +// Helper to check if CUDA is available +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} + +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = 
static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +// Helper to calculate numel from sizes +int64_t calculateNumel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +} // namespace + +// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); - - // Clean up any allocated memory buffers - for (void* ptr : cuda_memory_buffers_) { - if (ptr) { - cudaError_t cuda_err = cudaFree(ptr); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Failed to free CUDA memory: " << cudaGetErrorString(cuda_err); + // Clean up tensors + for (Tensor* t : tensors_) { + delete t; + } + tensors_.clear(); + + // Clean up CUDA memory + for (void* ptr : cuda_memory_) { + if (ptr != nullptr) { + cudaFree(ptr); } } - cuda_memory_buffers_.clear(); + cuda_memory_.clear(); - for (void* ptr : cpu_memory_buffers_) { - if (ptr) { + // Clean up CPU memory + for (void* ptr : cpu_memory_) { + if (ptr != nullptr) { free(ptr); } } - cpu_memory_buffers_.clear(); + cpu_memory_.clear(); } - // Helper to allocate CUDA memory and track it for cleanup - void* allocate_cuda_memory(size_t bytes) { - void* ptr; - cudaError_t err = cudaMallocManaged(&ptr, bytes); - if (err == cudaSuccess) { - cuda_memory_buffers_.push_back(ptr); - return ptr; + void* allocateCudaMemory(size_t bytes) { + void* ptr = nullptr; + cudaError_t err = cudaMalloc(&ptr, bytes); + if (err == cudaSuccess && ptr != nullptr) { + cuda_memory_.push_back(ptr); } - return nullptr; + return ptr; } - // Helper to allocate CPU memory and track it for cleanup - void* allocate_cpu_memory(size_t bytes) { - void* ptr; - int result = posix_memalign(&ptr, 16, bytes); // 16-byte aligned + void* allocateCpuMemory(size_t bytes) { + void* ptr = nullptr; + int result = posix_memalign(&ptr, 16, bytes); if (result == 0 && ptr != nullptr) { - cpu_memory_buffers_.push_back(ptr); - return ptr; + cpu_memory_.push_back(ptr); } - return nullptr; + return ptr; } - // Helper to calculate number of elements from sizes - int64_t calculate_numel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); } - return numel; - } - - // Helper to calculate contiguous strides from sizes - std::vector calculate_contiguous_strides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - - strides[sizes.size() - 1] = 1; - // Use int64_t and check for underflow to avoid unsigned integer wraparound - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; } private: - std::vector cuda_memory_buffers_; - std::vector cpu_memory_buffers_; + std::vector tensors_; + std::vector cuda_memory_; + std::vector cpu_memory_; }; -// Test basic functionality with CUDA memory -TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCUDA) { - // Test 1D tensor - 
std::vector sizes_1d = {5}; - std::vector strides_1d = calculate_contiguous_strides(sizes_1d); - - // Allocate CUDA memory - size_t bytes = calculate_numel(sizes_1d) * sizeof(float); - void* cuda_data = allocate_cuda_memory(bytes); - ASSERT_NE(cuda_data, nullptr); - - Tensor* tensor_1d; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cuda_data, - sizes_1d.size(), - sizes_1d.data(), - strides_1d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_1d, - 0, // layout (strided) - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_1d, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor_1d->dim(), 1); - EXPECT_EQ(tensor_1d->size(0), 5); +// ============================================================================ +// Common test body - parameterized by device type +// ============================================================================ - // Verify the tensor uses the same data pointer - void* tensor_data = tensor_1d->mutable_data_ptr(); - EXPECT_EQ(tensor_data, cuda_data); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor_1d); - EXPECT_EQ(error, Error::Ok); - - // Test that the original memory is still accessible (proves tensor didn't own - // it) For CUDA memory, check that we can still access it (synchronously) - // after tensor deletion - float pattern_value = 42.0f; - cudaError_t cuda_err = cudaMemcpy( - cuda_data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, cuda_data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; -} - -// Test basic functionality with CPU memory -TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCPU) { - // Test 2D tensor - std::vector sizes_2d = {3, 4}; - std::vector strides_2d = calculate_contiguous_strides(sizes_2d); - - // Allocate CPU memory - size_t bytes = calculate_numel(sizes_2d) * sizeof(float); - void* cpu_data = allocate_cpu_memory(bytes); - ASSERT_NE(cpu_data, nullptr); +void runBasicFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor_2d; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cpu_data, - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), + data, + sizes.size(), + sizes.data(), + strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), - 0, // device index - &tensor_2d, - 0, // layout (strided) + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_2d, nullptr); + ASSERT_NE(tensor, nullptr); // Check tensor properties - EXPECT_EQ(tensor_2d->dim(), 2); - EXPECT_EQ(tensor_2d->size(0), 3); - EXPECT_EQ(tensor_2d->size(1), 4); - - // Verify the tensor 
uses the same data pointer - void* tensor_data = tensor_2d->mutable_data_ptr(); - EXPECT_EQ(tensor_data, cpu_data); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->numel(), 6); + EXPECT_EQ( + static_cast(tensor->dtype()), + static_cast(slim_c10::ScalarType::Float)); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor_2d); - EXPECT_EQ(error, Error::Ok); + // Verify the tensor uses the same data pointer (non-owning) + EXPECT_EQ(tensor->data_ptr(), data); - // Test that the original memory is still accessible (proves tensor didn't own - // it) For CPU memory, directly write and read to verify accessibility - float* float_ptr = reinterpret_cast(cpu_data); - float pattern_value = 42.0f; - *float_ptr = pattern_value; - EXPECT_EQ(*float_ptr, pattern_value) - << "Original CPU memory should still be accessible after tensor deletion"; + // Cleanup - tensor should NOT free the original memory + delete tensor; } -// Test with invalid dtype -TEST_F(AOTITorchCreateTensorFromBlobV2Test, InvalidDtype) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); +void runScalarFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), 0, // storage_offset - 999, // invalid dtype - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size - EXPECT_EQ(error, Error::InvalidArgument); -} - -// Test with non-zero storage offset (should fail since from_blob cannot handle -// offsets) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, NonZeroStorageOffset) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 1, // non-zero storage_offset (should fail since from_blob cannot handle - // offsets) - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->data_ptr(), data); - EXPECT_EQ(error, Error::InvalidArgument); + delete tensor; } -// Test with custom strides (using stride parameter but still contiguous) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, CustomContiguousStrides) { - std::vector sizes = {2, 3}; - // Use the correct contiguous strides but pass them explicitly - std::vector contiguous_strides = {3, 1}; // Proper contiguous strides - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); +void runMultiDimensionalFromBlobTest( + 
AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), - contiguous_strides.data(), // Explicitly pass contiguous strides + strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->dim(), 3); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->numel(), 24); + EXPECT_EQ(tensor->data_ptr(), data); - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); - - // Verify strides were properly set (we can check via aoti_torch_get_strides) - int64_t* tensor_strides; - error = aoti_torch_get_strides(tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 3); - EXPECT_EQ(tensor_strides[1], 1); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; + delete tensor; } -// Test with null data pointer -TEST_F(AOTITorchCreateTensorFromBlobV2Test, NullDataPointer) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +void runCustomStridesFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - nullptr, // null data pointer + data, sizes.size(), sizes.data(), strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size - EXPECT_EQ(error, Error::InvalidArgument); -} + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); -// Test scalar tensor (0D) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); + 
EXPECT_FALSE(tensor->is_contiguous()); + EXPECT_EQ(tensor->data_ptr(), data); - size_t bytes = sizeof(float); // Single element - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); + delete tensor; +} + +void runStorageOffsetFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 2}; + std::vector strides = calculateContiguousStrides(sizes); Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( @@ -388,10 +279,10 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 2, // storage_offset = 2 elements + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata @@ -400,420 +291,343 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { EXPECT_EQ(error, Error::Ok); ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 0); - - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor->storage_offset(), 2); + // data_ptr should point to base + offset * itemsize + char* expected_ptr = static_cast(data) + 2 * sizeof(float); + EXPECT_EQ(tensor->data_ptr(), expected_ptr); - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; + delete tensor; } -// Test zero-sized tensor -TEST_F(AOTITorchCreateTensorFromBlobV2Test, ZeroSizedTensor) { - std::vector sizes = {0, 5}; // Zero elements - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// CPU Tests +// ============================================================================ - // Even for zero-sized tensor, we need some memory allocated - size_t bytes = sizeof(float); // Minimum allocation - void* data = allocate_cuda_memory(bytes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); ASSERT_NE(data, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, 
nullptr); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) { + size_t bytes = sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) { + size_t bytes = 24 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) { + size_t bytes = 12 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); } -// Test multi-dimensional tensors -TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultiDimensionalTensors) { - // Test 3D tensor - std::vector sizes_3d = {2, 3, 4}; - std::vector strides_3d = calculate_contiguous_strides(sizes_3d); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) { + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - size_t bytes_3d = calculate_numel(sizes_3d) * sizeof(float); - void* data_3d = allocate_cuda_memory(bytes_3d); - ASSERT_NE(data_3d, nullptr); + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - Tensor* tensor_3d; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data_3d, - sizes_3d.size(), - sizes_3d.data(), - strides_3d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_3d, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +// ============================================================================ +// CUDA Tests +// ============================================================================ - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_3d, nullptr); - EXPECT_EQ(tensor_3d->dim(), 3); - EXPECT_EQ(tensor_3d->size(0), 2); - EXPECT_EQ(tensor_3d->size(1), 3); - EXPECT_EQ(tensor_3d->size(2), 4); - - // Test 4D tensor - std::vector sizes_4d = {2, 3, 4, 5}; - std::vector strides_4d = calculate_contiguous_strides(sizes_4d); - - size_t bytes_4d = calculate_numel(sizes_4d) * sizeof(float); - 
void* data_4d = allocate_cuda_memory(bytes_4d); - ASSERT_NE(data_4d, nullptr); - - Tensor* tensor_4d; - error = aoti_torch_create_tensor_from_blob_v2( - data_4d, - sizes_4d.size(), - sizes_4d.data(), - strides_4d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_4d, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_4d, nullptr); - EXPECT_EQ(tensor_4d->dim(), 4); - EXPECT_EQ(tensor_4d->size(0), 2); - EXPECT_EQ(tensor_4d->size(1), 3); - EXPECT_EQ(tensor_4d->size(2), 4); - EXPECT_EQ(tensor_4d->size(3), 5); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test tensor data pointer consistency -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DataPointerConsistency) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* original_data = allocate_cuda_memory(bytes); - ASSERT_NE(original_data, nullptr); + size_t bytes = sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - original_data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 24 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); - // Check that the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, original_data); + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test creating multiple tensors from different blobs -TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultipleTensorsFromBlobs) { - const int num_tensors = 5; - std::vector tensors; - std::vector data_ptrs; - - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {i + 1, i + 2}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); - data_ptrs.push_back(data); - - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - 
tensors.push_back(tensor); - - // Verify dimensions - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), i + 1); - EXPECT_EQ(tensor->size(1), i + 2); - - // Verify the tensor uses the correct data pointer - EXPECT_EQ(tensor->mutable_data_ptr(), data); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; } - // Verify all tensors have different data pointers - for (int i = 0; i < num_tensors; i++) { - EXPECT_EQ(tensors[i]->mutable_data_ptr(), data_ptrs[i]); - for (int j = i + 1; j < num_tensors; j++) { - EXPECT_NE(tensors[i]->mutable_data_ptr(), tensors[j]->mutable_data_ptr()); - } + size_t bytes = 12 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; } + + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test deletion of tensor created from blob (should not free the original -// memory) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeletionDoesNotFreeOriginalMemory) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// Verify Non-Owning Behavior +// ============================================================================ - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); ASSERT_NE(data, nullptr); - Tensor* tensor; + // Write a pattern + float* float_data = static_cast(data); + float_data[0] = 42.0f; + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; - // The original memory should still be valid (we'll free it in teardown) - // We can't easily test if the memory is still valid without risking crashes, - // but the test should pass without issues if memory management is correct + // Memory should still be accessible + EXPECT_FLOAT_EQ(float_data[0], 42.0f); } -// Test with opaque metadata -TEST_F(AOTITorchCreateTensorFromBlobV2Test, WithOpaqueMetadata) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, 
NonOwningBehavior_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); ASSERT_NE(data, nullptr); - // Create some opaque metadata - std::vector metadata = {0x01, 0x02, 0x03, 0x04}; + // Write a pattern + float pattern = 42.0f; + cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice); - Tensor* tensor; + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor, - 0, // layout - metadata.data(), // opaque_metadata - metadata.size()); // opaque_metadata_size + 0, + nullptr, + 0); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); -} + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; -// Test stress test with many small tensors from blobs -TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) { - const int num_tensors = 50; // Reduced for reasonable test time - std::vector tensors; + // Memory should still be accessible + float readback = 0.0f; + cudaError_t cuda_err = + cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_FLOAT_EQ(readback, 42.0f); +} - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; // Minimal size - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// Error Cases +// ============================================================================ - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - if (data == nullptr) { - // Skip if we run out of memory - continue; - } +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - if (error == Error::Ok && tensor != nullptr) { - tensors.push_back(tensor); - - // Verify the tensor uses the correct data pointer - EXPECT_EQ(tensor->mutable_data_ptr(), data); - } - } + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // null data + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor, + 0, + nullptr, + 0); - // Delete all created tensors - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } + EXPECT_EQ(error, 
Error::InvalidArgument); } -// Test device type mismatch: CPU data with CUDA device request should fail -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCPUDataCUDADevice) { +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); + std::vector strides = calculateContiguousStrides(sizes); + + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + nullptr, // null return pointer + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::InvalidArgument); +} - // Allocate CPU memory - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* cpu_data = allocate_cpu_memory(bytes); - ASSERT_NE(cpu_data, nullptr); +// ============================================================================ +// Verify Device Properties +// ============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - // Request CUDA device but provide CPU memory - should fail + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cpu_data, + data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), // Request CUDA - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_TRUE(tensor->is_cpu()); + EXPECT_FALSE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - EXPECT_EQ(error, Error::InvalidArgument) - << "Should fail when CPU data is provided but CUDA device is requested"; + delete tensor; } -// Test device type mismatch: CUDA data with CPU device request should fail -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCUDADataCPUDevice) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Allocate CUDA memory (device memory, not managed) - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* cuda_data = nullptr; - cudaError_t cuda_err = cudaMalloc(&cuda_data, bytes); - ASSERT_EQ(cuda_err, cudaSuccess); - ASSERT_NE(cuda_data, nullptr); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - // Request CPU device but provide CUDA memory - should fail + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cuda_data, + data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), // 
Request CPU - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); - EXPECT_EQ(error, Error::InvalidArgument) - << "Should fail when CUDA data is provided but CPU device is requested"; + EXPECT_FALSE(tensor->is_cpu()); + EXPECT_TRUE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - // Clean up the CUDA memory we allocated directly - cudaFree(cuda_data); + delete tensor; } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp deleted file mode 100644 index 21f8c79cc46..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -// Helper to check if CUDA is available -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -// Helper to calculate numel from sizes -int64_t calculateNumel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; - } - return numel; -} - -} // namespace - -// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2 tests -class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Clean up tensors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - - // Clean up CUDA memory - for (void* ptr : cuda_memory_) { - if (ptr != nullptr) { - cudaFree(ptr); - } - } - cuda_memory_.clear(); - - // Clean up CPU memory - for (void* ptr : cpu_memory_) { - if (ptr != nullptr) { - free(ptr); - } - } - cpu_memory_.clear(); - } - - void* allocateCudaMemory(size_t bytes) { - void* ptr = nullptr; - cudaError_t err = cudaMalloc(&ptr, bytes); - if (err == cudaSuccess && ptr != nullptr) { - cuda_memory_.push_back(ptr); - } - return ptr; - } - - void* allocateCpuMemory(size_t bytes) { - void* ptr = nullptr; - int result = posix_memalign(&ptr, 16, bytes); - if (result == 0 && ptr != nullptr) { - cpu_memory_.push_back(ptr); - } - return ptr; - } - - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - private: - std::vector tensors_; - std::vector cuda_memory_; - std::vector cpu_memory_; -}; - -// 
============================================================================ -// Common test body - parameterized by device type -// ============================================================================ - -void runBasicFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->numel(), 6); - EXPECT_EQ( - static_cast(tensor->dtype()), - static_cast(slim_c10::ScalarType::Float)); - - // Verify the tensor uses the same data pointer (non-owning) - EXPECT_EQ(tensor->data_ptr(), data); - - // Cleanup - tensor should NOT free the original memory - delete tensor; -} - -void runScalarFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runMultiDimensionalFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - EXPECT_EQ(tensor->numel(), 24); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runCustomStridesFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->stride(0), 
1); - EXPECT_EQ(tensor->stride(1), 3); - EXPECT_FALSE(tensor->is_contiguous()); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runStorageOffsetFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 2}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 2, // storage_offset = 2 elements - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->storage_offset(), 2); - // data_ptr should point to base + offset * itemsize - char* expected_ptr = static_cast(data) + 2 * sizeof(float); - EXPECT_EQ(tensor->data_ptr(), expected_ptr); - - delete tensor; -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runBasicFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) { - size_t bytes = sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runScalarFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) { - size_t bytes = 24 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runMultiDimensionalFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) { - size_t bytes = 12 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runCustomStridesFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) { - // Allocate extra space for offset - size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runStorageOffsetFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = 6 * sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - runBasicFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - runScalarFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, 
MultiDimensional_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 24 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runMultiDimensionalFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 12 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runCustomStridesFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  // Allocate extra space for offset
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runStorageOffsetFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-// ============================================================================
-// Verify Non-Owning Behavior
-// ============================================================================
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) {
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCpuMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  // Write a pattern
-  float* float_data = static_cast<float*>(data);
-  float_data[0] = 42.0f;
-
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
-
-  Tensor* tensor = nullptr;
-  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
-      data,
-      sizes.size(),
-      sizes.data(),
-      strides.data(),
-      0,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0,
-      &tensor,
-      0,
-      nullptr,
-      0);
-
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(tensor, nullptr);
-
-  // Delete tensor - memory should NOT be freed
-  delete tensor;
-  tensor = nullptr;
-
-  // Memory should still be accessible
-  EXPECT_FLOAT_EQ(float_data[0], 42.0f);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  // Write a pattern
-  float pattern = 42.0f;
-  cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice);
-
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
-
-  Tensor* tensor = nullptr;
-  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
-      data,
-      sizes.size(),
-      sizes.data(),
-      strides.data(),
-      0,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0,
-      &tensor,
-      0,
-      nullptr,
-      0);
-
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(tensor, nullptr);
-
-  // Delete tensor - memory should NOT be freed
-  delete tensor;
-  tensor = nullptr;
-
-  // Memory should still be accessible
-  float readback = 0.0f;
-  cudaError_t cuda_err =
-      cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost);
-  EXPECT_EQ(cuda_err, cudaSuccess);
-  EXPECT_FLOAT_EQ(readback, 42.0f);
-}
-
-// ============================================================================
-// Error Cases
-// ============================================================================
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) {
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides =
calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - nullptr, // null data - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - nullptr, // null return pointer - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -// ============================================================================ -// Verify Device Properties -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_TRUE(tensor->is_cpu()); - EXPECT_FALSE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - - delete tensor; -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = 6 * sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_FALSE(tensor->is_cpu()); - EXPECT_TRUE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - - delete tensor; -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp index 10c8d8c1a31..e88ebb3185c 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp @@ -7,64 +7,70 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for 
aoti_torch_delete_tensor_object tests
-class AOTITorchDeleteTensorObjectTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    // Initialize ExecuTorch Platform Abstraction Layer
-    et_pal_init();
+namespace slim_c10 = executorch::backends::aoti::slim::c10;
-    // Check if CUDA is available
-    int device_count = 0;
-    cudaError_t err = cudaGetDeviceCount(&device_count);
-    if (err != cudaSuccess || device_count == 0) {
-      GTEST_SKIP() << "CUDA not available, skipping CUDA tests";
-    }
+namespace {
-    // Clean up any existing cached metadata before each test
-    cleanup_tensor_metadata();
+bool isCudaAvailable() {
+  int device_count = 0;
+  cudaError_t err = cudaGetDeviceCount(&device_count);
+  return (err == cudaSuccess && device_count > 0);
+}
-    // Clear any remaining tensors from previous tests
-    clear_all_tensors();
+std::vector<int64_t> calculateContiguousStrides(
+    const std::vector<int64_t>& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  if (sizes.empty()) {
+    return strides;
+  }
+  strides[sizes.size() - 1] = 1;
+  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * sizes[i + 1];
   }
+  return strides;
+}
-  void TearDown() override {
-    // Clean up metadata
-    cleanup_tensor_metadata();
+} // namespace
-    // Clear the global tensor storage using the provided function
-    clear_all_tensors();
+class AOTITorchDeleteTensorObjectSlimTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
   }
-  // Helper to create test tensors
-  Tensor* create_test_tensor(
+  void TearDown() override {
+    // SlimTensor uses automatic reference counting - no manual cleanup needed
+  }
+
+  Tensor* createTestTensor(
       const std::vector<int64_t>& sizes,
       const std::vector<int64_t>& strides = {},
-      int32_t dtype = 6, // float32
-      int32_t device_type = 1, // CUDA
+      int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float),
+      int32_t device_type = static_cast<int32_t>(slim_c10::DeviceType::CPU),
       int32_t device_index = 0) {
-    Tensor* tensor;
+    Tensor* tensor = nullptr;
-    const int64_t* strides_ptr = strides.empty() ?
nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, @@ -74,254 +80,241 @@ class AOTITorchDeleteTensorObjectTest : public ::testing::Test { } }; -// Test basic deletion of CUDA tensor -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCudaTensorBasic) { - // Create a CUDA tensor +// ============================================================================ +// CPU Tests +// ============================================================================ + +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCpuTensorBasic) { std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties before deletion EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test basic deletion of CPU tensor -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCpuTensorBasic) { - // Create a CPU tensor - std::vector sizes = {3, 4}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 0, 0); // CPU device - ASSERT_NE(tensor, nullptr); - - // Verify tensor properties before deletion - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); - - // Delete the tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -// Test deletion of null tensor pointer -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteNullTensor) { +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteNullTensor) { AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); EXPECT_EQ(error, Error::InvalidArgument); } -// Test deletion of tensor not in tracking system -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteUntrackedTensor) { - // Create a tensor and then clear the tracking system - std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - - // Clear the tracking system (simulating an untracked tensor) - clear_all_tensors(); - - // Try to delete the tensor - should fail - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::InvalidArgument); -} - -// Test deletion of multiple tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMultipleTensors) { - // Create multiple tensors +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CPU) { std::vector tensors; for (int i = 1; i <= 5; i++) { std::vector sizes = {i, i + 1}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); tensors.push_back(tensor); } - // Delete all tensors for (Tensor* tensor : tensors) { AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } } -// Test deletion of zero-sized tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteZeroSizedTensor) { - // Create a zero-sized tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteZeroSizedTensor_CPU) { 
std::vector sizes = {0, 5}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 0); EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->numel(), 0); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of scalar (0D) tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteScalarTensor) { - // Create a scalar tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteScalarTensor_CPU) { std::vector sizes = {}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of large multi-dimensional tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteLargeTensor) { - // Create a large multi-dimensional tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CPU) { std::vector sizes = {10, 20, 30}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 10); - EXPECT_EQ(tensor->size(1), 20); - EXPECT_EQ(tensor->size(2), 30); + EXPECT_EQ(tensor->numel(), 6000); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of tensors with custom strides -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteTensorWithCustomStrides) { - // Create tensor with custom strides +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteTensorWithCustomStrides_CPU) { std::vector sizes = {3, 4}; - std::vector strides = {4, 1}; // Row-major strides - Tensor* tensor = create_test_tensor(sizes, strides); + std::vector strides = {1, 3}; // Column-major + Tensor* tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion after accessing tensor data -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteAfterDataAccess) { - // Create a tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteDifferentDtypes_CPU) { std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - // Access tensor data (this should not prevent deletion) - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Delete the tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} + // Float + { + Tensor* tensor = createTestTensor( + sizes, + {}, + 
static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } -// Test double deletion (should fail on second attempt) -TEST_F(AOTITorchDeleteTensorObjectTest, DoubleDeletion) { - // Create a tensor - std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); + // BFloat16 + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } - // First deletion should succeed - AOTITorchError error1 = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error1, Error::Ok); + // Long + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } - // Second deletion should fail (tensor no longer tracked) - AOTITorchError error2 = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error2, Error::InvalidArgument); + // Bool + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } } -// Test deletion of tensors on both CUDA and CPU devices -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMixedDeviceTensors) { - // Create CUDA tensor - std::vector sizes = {2, 3}; - Tensor* cuda_tensor = create_test_tensor(sizes, {}, 6, 1, 0); - ASSERT_NE(cuda_tensor, nullptr); - - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor(sizes, {}, 6, 0, 0); - ASSERT_NE(cpu_tensor, nullptr); +// ============================================================================ +// CUDA Tests +// ============================================================================ - // Delete both tensors - AOTITorchError cuda_error = aoti_torch_delete_tensor_object(cuda_tensor); - EXPECT_EQ(cuda_error, Error::Ok); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCudaTensorBasic) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - AOTITorchError cpu_error = aoti_torch_delete_tensor_object(cpu_tensor); - EXPECT_EQ(cpu_error, Error::Ok); -} + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(tensor, nullptr); -// Test memory consistency after deletion -TEST_F(AOTITorchDeleteTensorObjectTest, MemoryConsistencyAfterDeletion) { - // Create multiple tensors - std::vector tensors; - const int num_tensors = 10; + EXPECT_EQ(tensor->dim(), 2); + EXPECT_TRUE(tensor->is_cuda()); - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {i + 1, i + 2}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} - // Delete every other tensor - for (int i = 0; i < num_tensors; i += 2) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); - EXPECT_EQ(error, Error::Ok); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not 
available"; } - // Delete remaining tensors - for (int i = 1; i < num_tensors; i += 2) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); - EXPECT_EQ(error, Error::Ok); - } -} - -// Test stress deletion with many small tensors -TEST_F(AOTITorchDeleteTensorObjectTest, StressDeletionManySmallTensors) { - const int num_tensors = 100; std::vector tensors; - // Create many small tensors - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; // Minimal size - Tensor* tensor = create_test_tensor(sizes); - if (tensor != nullptr) { - tensors.push_back(tensor); - } + for (int i = 1; i <= 5; i++) { + std::vector sizes = {i, i + 1}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); } - // Delete all created tensors for (Tensor* tensor : tensors) { AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } } -// Test CUDA synchronization during deletion -TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { - // Create a larger CUDA tensor to ensure memory allocation +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {100, 100}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); - // Delete the tensor (should handle synchronization internally) AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); @@ -330,125 +323,63 @@ TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { EXPECT_EQ(cuda_error, cudaSuccess); } -// Test specific deletion of bfloat16 tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteBFloat16Tensor) { - // Test 1D bfloat16 tensor deletion - std::vector sizes_1d = {10}; - Tensor* tensor_bf16_1d = create_test_tensor( - sizes_1d, - {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_1d, nullptr); - - // Verify it's bfloat16 before deletion - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_1d, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << actual_dtype; - - // Verify element size (bfloat16 should be 2 bytes per element) - EXPECT_EQ(tensor_bf16_1d->element_size(), 2); - - // Delete the bfloat16 tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor_bf16_1d); - EXPECT_EQ(error, Error::Ok); - - // Test 2D bfloat16 tensor deletion with custom strides - std::vector sizes_2d = {4, 6}; - std::vector strides_2d = {6, 1}; // Row-major strides - Tensor* tensor_bf16_2d = create_test_tensor( - sizes_2d, - strides_2d, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_2d, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor_bf16_2d->dim(), 2); - EXPECT_EQ(tensor_bf16_2d->size(0), 4); - EXPECT_EQ(tensor_bf16_2d->size(1), 6); - EXPECT_EQ(tensor_bf16_2d->element_size(), 2); - - // Verify it's bfloat16 - int32_t dtype_2d; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_2d, &dtype_2d), Error::Ok); - EXPECT_EQ(dtype_2d, 
static_cast(SupportedDTypes::BFLOAT16)); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMixedDeviceTensors) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Delete the 2D bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_2d); - EXPECT_EQ(error, Error::Ok); + std::vector sizes = {2, 3}; - // Test 3D bfloat16 tensor deletion - std::vector sizes_3d = {2, 3, 4}; - Tensor* tensor_bf16_3d = create_test_tensor( - sizes_3d, + // Create CUDA tensor + Tensor* cuda_tensor = createTestTensor( + sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor_bf16_3d, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor_bf16_3d->dim(), 3); - EXPECT_EQ(tensor_bf16_3d->size(0), 2); - EXPECT_EQ(tensor_bf16_3d->size(1), 3); - EXPECT_EQ(tensor_bf16_3d->size(2), 4); - EXPECT_EQ(tensor_bf16_3d->element_size(), 2); - - // Verify memory size (2 * 3 * 4 * 2 bytes = 48 bytes) - size_t expected_memory = 2 * 3 * 4 * 2; - size_t actual_memory = - tensor_bf16_3d->numel() * tensor_bf16_3d->element_size(); - EXPECT_EQ(actual_memory, expected_memory); - - // Delete the 3D bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_3d); - EXPECT_EQ(error, Error::Ok); + ASSERT_NE(cuda_tensor, nullptr); + EXPECT_TRUE(cuda_tensor->is_cuda()); - // Test bfloat16 scalar tensor (0D) deletion - std::vector scalar_sizes = {}; - Tensor* tensor_bf16_scalar = create_test_tensor( - scalar_sizes, + // Create CPU tensor + Tensor* cpu_tensor = createTestTensor( + sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor_bf16_scalar, nullptr); + ASSERT_NE(cpu_tensor, nullptr); + EXPECT_TRUE(cpu_tensor->is_cpu()); - // Verify scalar tensor properties - EXPECT_EQ(tensor_bf16_scalar->dim(), 0); - EXPECT_EQ(tensor_bf16_scalar->numel(), 1); - EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + // Delete both tensors + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); +} - // Delete the scalar bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_scalar); - EXPECT_EQ(error, Error::Ok); +// ============================================================================ +// Stress Tests +// ============================================================================ - // Test zero-element bfloat16 tensor deletion - std::vector zero_sizes = {0, 5}; - Tensor* tensor_bf16_zero = create_test_tensor( - zero_sizes, - {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_zero, nullptr); +TEST_F( + AOTITorchDeleteTensorObjectSlimTest, + StressDeletionManySmallTensors_CPU) { + const int num_tensors = 100; + std::vector tensors; - // Verify zero-element tensor properties - EXPECT_EQ(tensor_bf16_zero->dim(), 2); - EXPECT_EQ(tensor_bf16_zero->size(0), 0); - EXPECT_EQ(tensor_bf16_zero->size(1), 5); - EXPECT_EQ(tensor_bf16_zero->numel(), 0); - EXPECT_EQ(tensor_bf16_zero->element_size(), 2); + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + if (tensor != nullptr) { + tensors.push_back(tensor); + } + } - // Delete the zero-element bfloat16 tensor - 
error = aoti_torch_delete_tensor_object(tensor_bf16_zero); - EXPECT_EQ(error, Error::Ok); + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } } - -// Test deletion of mixed dtype tensors (float32 and bfloat16) diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp deleted file mode 100644 index e88ebb3185c..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchDeleteTensorObjectSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // SlimTensor uses automatic reference counting - no manual cleanup needed - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCpuTensorBasic) { - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteNullTensor) { - AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CPU) { - std::vector tensors; - - for (int i = 1; i <= 5; i++) { - std::vector sizes = {i, i + 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteZeroSizedTensor_CPU) { - std::vector sizes = {0, 5}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); - EXPECT_EQ(tensor->numel(), 0); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteScalarTensor_CPU) { - std::vector sizes = {}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CPU) { - std::vector sizes = {10, 20, 30}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->numel(), 6000); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteTensorWithCustomStrides_CPU) { - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - Tensor* tensor = createTestTensor( - sizes, - strides, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->stride(0), 1); - EXPECT_EQ(tensor->stride(1), 3); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteDifferentDtypes_CPU) { - std::vector sizes = {2, 3}; - - // Float - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - 
static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // BFloat16 - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // Long - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // Bool - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCudaTensorBasic) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_TRUE(tensor->is_cuda()); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector tensors; - - for (int i = 1; i <= 5; i++) { - std::vector sizes = {i, i + 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {100, 100}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - - // Verify CUDA state is still good - cudaError_t cuda_error = cudaGetLastError(); - EXPECT_EQ(cuda_error, cudaSuccess); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMixedDeviceTensors) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - // Create CUDA tensor - Tensor* cuda_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(cuda_tensor, nullptr); - EXPECT_TRUE(cuda_tensor->is_cuda()); - - // Create CPU tensor - Tensor* cpu_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(cpu_tensor, nullptr); - EXPECT_TRUE(cpu_tensor->is_cpu()); - - // Delete both tensors - 
EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); -} - -// ============================================================================ -// Stress Tests -// ============================================================================ - -TEST_F( - AOTITorchDeleteTensorObjectSlimTest, - StressDeletionManySmallTensors_CPU) { - const int num_tensors = 100; - std::vector tensors; - - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - if (tensor != nullptr) { - tensors.push_back(tensor); - } - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp index 799a8d1221b..d563eea98bc 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -7,661 +7,461 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::backends::aoti; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_empty_strided tests -class AOTITorchEmptyStridedTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +// Helper to check if CUDA is available +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; } - - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - // Helper to create test tensors - Tensor* create_tracked_tensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), - int32_t device_index = 0) { - Tensor* tensor; +} // namespace - const int64_t* strides_ptr = strides.empty() ? 
nullptr : strides.data(); +// Test fixture for SlimTensor-based aoti_torch_empty_strided tests +class AOTITorchEmptyStridedSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + } - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides_ptr, - dtype, - device_type, - device_index, - &tensor); + void TearDown() override { + // Tensors are cleaned up via their destructors + for (Tensor* t : tensors_) { + delete t; + } + tensors_.clear(); + } - return (error == Error::Ok) ? tensor : nullptr; + // Track tensors for cleanup + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); + } } + + private: + std::vector tensors_; }; -// Test aoti_torch_empty_strided basic functionality -TEST_F(AOTITorchEmptyStridedTest, BasicFunctionality) { +// ============================================================================ +// Common test body - parameterized by device type +// ============================================================================ + +void runBasicEmptyStridedTest(int32_t device_type, int32_t device_index) { // Test 1D tensor std::vector sizes_1d = {5}; - Tensor* tensor_1d; + std::vector strides_1d = calculateContiguousStrides(sizes_1d); + + Tensor* tensor_1d = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes_1d.size(), sizes_1d.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + strides_1d.data(), + static_cast(slim_c10::ScalarType::Float), // dtype = 6 + device_type, + device_index, &tensor_1d); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_1d, nullptr); - - // CRITICAL: Verify the tensor is actually float32 - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_1d, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::FLOAT32)) - << "Expected float32 dtype (" - << static_cast(SupportedDTypes::FLOAT32) << "), got " - << actual_dtype; - - // Verify element size (float32 should be 4 bytes per element) - size_t element_size = tensor_1d->element_size(); - EXPECT_EQ(element_size, 4) - << "Expected float32 element size to be 4 bytes, got " << element_size; - - // Verify total number of elements and memory usage - int64_t expected_numel = 5; // 5 elements - EXPECT_EQ(tensor_1d->numel(), expected_numel) - << "Expected " << expected_numel << " elements, got " - << tensor_1d->numel(); - - // Verify total memory size (numel * element_size) - size_t expected_memory_size = expected_numel * 4; // 5 * 4 = 20 bytes - size_t actual_memory_size = tensor_1d->numel() * tensor_1d->element_size(); - EXPECT_EQ(actual_memory_size, expected_memory_size) - << "Expected " << expected_memory_size << " bytes, got " - << actual_memory_size; + ASSERT_NE(tensor_1d, nullptr); // Check tensor properties EXPECT_EQ(tensor_1d->dim(), 1); EXPECT_EQ(tensor_1d->size(0), 5); + EXPECT_EQ(tensor_1d->numel(), 5); + EXPECT_EQ( + static_cast(tensor_1d->dtype()), + static_cast(slim_c10::ScalarType::Float)); + EXPECT_NE(tensor_1d->data_ptr(), nullptr); - // Test 2D tensor with explicit strides - std::vector sizes_2d = {3, 4}; - std::vector strides_2d = {4, 1}; - Tensor* tensor_2d; - error = aoti_torch_empty_strided( - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_2d); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_2d, nullptr); - - // Verify 2D tensor is 
also float32 - int32_t dtype_2d; - EXPECT_EQ(aoti_torch_get_dtype(tensor_2d, &dtype_2d), Error::Ok); - EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::FLOAT32)) - << "Expected float32 dtype (" - << static_cast(SupportedDTypes::FLOAT32) << "), got " - << dtype_2d; - - // Verify element size for 2D tensor - EXPECT_EQ(tensor_2d->element_size(), 4); - - // Check tensor properties - EXPECT_EQ(tensor_2d->dim(), 2); - EXPECT_EQ(tensor_2d->size(0), 3); - EXPECT_EQ(tensor_2d->size(1), 4); - - // Verify memory size for 2D tensor - int64_t expected_numel_2d = 3 * 4; // 12 elements - size_t expected_memory_2d = expected_numel_2d * 4; // 12 * 4 = 48 bytes - EXPECT_EQ(tensor_2d->numel() * tensor_2d->element_size(), expected_memory_2d); + // Cleanup + delete tensor_1d; } -// Test aoti_torch_empty_strided with CPU device -TEST_F(AOTITorchEmptyStridedTest, CPUDevice) { - std::vector sizes = {2, 3}; - Tensor* tensor; +void runMultiDimensionalEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + // Test 3D tensor + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->dim(), 3); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); -} + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->numel(), 24); -// Test aoti_torch_empty_strided with invalid dtype -TEST_F(AOTITorchEmptyStridedTest, InvalidDtype) { - std::vector sizes = {2, 3}; - Tensor* tensor; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 999, // invalid dtype - 1, // CUDA device - 0, // device index - &tensor); + // Check strides + EXPECT_EQ(tensor->stride(0), 12); + EXPECT_EQ(tensor->stride(1), 4); + EXPECT_EQ(tensor->stride(2), 1); - EXPECT_EQ(error, Error::InvalidArgument); + delete tensor; } -// Test aoti_torch_empty_strided with unsupported device -TEST_F(AOTITorchEmptyStridedTest, UnsupportedDevice) { - std::vector sizes = {2, 3}; - Tensor* tensor; +void runScalarTensorEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 2, // unsupported device type - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); - EXPECT_EQ(error, Error::NotImplemented); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_NE(tensor->data_ptr(), nullptr); + + delete tensor; } -// Test aoti_torch_empty_strided with zero-sized tensor -TEST_F(AOTITorchEmptyStridedTest, ZeroSized) { +void runZeroSizedTensorEmptyStridedTest( + int32_t device_type, + int32_t device_index) { std::vector sizes = {0, 5}; - Tensor* tensor; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 
6, // float32 - 1, // CUDA device - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 0); EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->numel(), 0); + + delete tensor; } -// Test aoti_torch_empty_strided scalar tensor (0D) -TEST_F(AOTITorchEmptyStridedTest, Scalar) { - std::vector sizes = {}; - Tensor* tensor; +void runCustomStridesEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + // Create a transposed (column-major) tensor + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 0); -} + ASSERT_NE(tensor, nullptr); -// Test aoti_torch_empty_strided with large tensor -TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { - std::vector sizes = {100, 200, 50}; - Tensor* tensor; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index - &tensor); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + // Non-contiguous due to custom strides + EXPECT_FALSE(tensor->is_contiguous()); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 100); - EXPECT_EQ(tensor->size(1), 200); - EXPECT_EQ(tensor->size(2), 50); + delete tensor; } -// Test aoti_torch_empty_strided with bfloat16 dtype -TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { - // Test creating bfloat16 tensor on CUDA - std::vector sizes = {2, 3, 4}; - Tensor* tensor_bf16; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_bf16); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_bf16, nullptr); - - // CRITICAL: Verify the tensor is actually bfloat16 - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << actual_dtype; - - // Verify element size (bfloat16 should be 2 bytes per element) - size_t element_size = tensor_bf16->element_size(); - EXPECT_EQ(element_size, 2) - << "Expected bfloat16 element size to be 2 bytes, got " << element_size; - - // Verify total number of elements and memory usage - int64_t expected_numel = 2 * 3 * 4; // 24 elements - EXPECT_EQ(tensor_bf16->numel(), expected_numel) - << "Expected " << expected_numel << " elements, got " - << tensor_bf16->numel(); - - // Verify total memory size (numel * element_size) - size_t expected_memory_size = expected_numel * 2; // 24 * 2 = 48 bytes - size_t actual_memory_size = - tensor_bf16->numel() * tensor_bf16->element_size(); - 
EXPECT_EQ(actual_memory_size, expected_memory_size) - << "Expected " << expected_memory_size << " bytes, got " - << actual_memory_size; +void runDifferentDtypesEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - // Check tensor properties - EXPECT_EQ(tensor_bf16->dim(), 3); - EXPECT_EQ(tensor_bf16->size(0), 2); - EXPECT_EQ(tensor_bf16->size(1), 3); - EXPECT_EQ(tensor_bf16->size(2), 4); - - // Verify we can get tensor metadata - int64_t* sizes_ptr; - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_sizes(tensor_bf16, &sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor_bf16, &strides_ptr), Error::Ok); - - // Check sizes match - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 3); - EXPECT_EQ(sizes_ptr[2], 4); - - // Check that strides are computed correctly (row-major order) - EXPECT_EQ(strides_ptr[0], 12); // 3 * 4 - EXPECT_EQ(strides_ptr[1], 4); // 4 - EXPECT_EQ(strides_ptr[2], 1); // 1 - - // Test bfloat16 tensor with custom strides - std::vector sizes_2d = {3, 2}; - std::vector strides_2d = {2, 1}; // Row-major strides - Tensor* tensor_bf16_custom; - error = aoti_torch_empty_strided( - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_bf16_custom); + // Test Float32 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Float); + EXPECT_EQ(tensor->itemsize(), 4); + delete tensor; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_bf16_custom, nullptr); + // Test BFloat16 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::BFloat16), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::BFloat16); + EXPECT_EQ(tensor->itemsize(), 2); + delete tensor; + } - // Verify custom stride tensor is also bfloat16 - int32_t custom_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_custom, &custom_dtype), Error::Ok); - EXPECT_EQ(custom_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << custom_dtype; + // Test Int64 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Long), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Long); + EXPECT_EQ(tensor->itemsize(), 8); + delete tensor; + } - // Verify element size for custom stride tensor - EXPECT_EQ(tensor_bf16_custom->element_size(), 2); + // Test Bool + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Bool), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Bool); + EXPECT_EQ(tensor->itemsize(), 
1);
+    delete tensor;
+  }
+}
 
-  // Check tensor properties
-  EXPECT_EQ(tensor_bf16_custom->dim(), 2);
-  EXPECT_EQ(tensor_bf16_custom->size(0), 3);
-  EXPECT_EQ(tensor_bf16_custom->size(1), 2);
+// ============================================================================
+// CPU Tests
+// ============================================================================
 
-  // Verify memory size for custom stride tensor
-  int64_t custom_expected_numel = 3 * 2; // 6 elements
-  size_t custom_expected_memory = custom_expected_numel * 2; // 6 * 2 = 12 bytes
-  EXPECT_EQ(
-      tensor_bf16_custom->numel() * tensor_bf16_custom->element_size(),
-      custom_expected_memory);
+TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CPU) {
+  runBasicEmptyStridedTest(static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Check custom strides
-  int64_t* custom_strides_ptr;
-  EXPECT_EQ(
-      aoti_torch_get_strides(tensor_bf16_custom, &custom_strides_ptr),
-      Error::Ok);
-  EXPECT_EQ(custom_strides_ptr[0], 2);
-  EXPECT_EQ(custom_strides_ptr[1], 1);
-
-  // Test bfloat16 scalar tensor (0D)
-  std::vector<int64_t> scalar_sizes = {};
-  Tensor* tensor_bf16_scalar;
-  error = aoti_torch_empty_strided(
-      scalar_sizes.size(),
-      scalar_sizes.data(),
-      nullptr,
-      static_cast<int32_t>(SupportedDTypes::BFLOAT16),
-      static_cast<int32_t>(SupportedDevices::CUDA),
-      0, // device index
-      &tensor_bf16_scalar);
+TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CPU) {
+  runMultiDimensionalEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(tensor_bf16_scalar, nullptr);
-  EXPECT_EQ(tensor_bf16_scalar->dim(), 0);
-
-  // Verify scalar tensor is also bfloat16
-  int32_t scalar_dtype;
-  EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_scalar, &scalar_dtype), Error::Ok);
-  EXPECT_EQ(scalar_dtype, static_cast<int32_t>(SupportedDTypes::BFLOAT16))
-      << "Expected bfloat16 dtype ("
-      << static_cast<int32_t>(SupportedDTypes::BFLOAT16) << "), got "
-      << scalar_dtype;
-
-  // Verify scalar tensor properties
-  EXPECT_EQ(tensor_bf16_scalar->element_size(), 2);
-  EXPECT_EQ(tensor_bf16_scalar->numel(), 1); // Scalar tensor has 1 element
-  EXPECT_EQ(
-      tensor_bf16_scalar->numel() * tensor_bf16_scalar->element_size(),
-      2); // 1 * 2 = 2 bytes
+TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CPU) {
+  runScalarTensorEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
 }
 
-// Test custom strides functionality
-TEST_F(AOTITorchEmptyStridedTest, CustomStrides) {
-  // Create tensor with valid custom strides (contiguous layout)
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = {3, 1}; // Standard row-major strides
+TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CPU) {
+  runZeroSizedTensorEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  Tensor* tensor = create_tracked_tensor(sizes, strides);
-  EXPECT_NE(tensor, nullptr);
+TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CPU) {
+  runCustomStridesEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Verify the tensor was created correctly
-  EXPECT_EQ(tensor->dim(), 2);
-  EXPECT_EQ(tensor->size(0), 2);
-  EXPECT_EQ(tensor->size(1), 3);
+TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CPU) {
+  runDifferentDtypesEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Check strides through AOTI interface
-  int64_t* strides_ptr;
-  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
-  EXPECT_EQ(strides_ptr[0], 3);
-  EXPECT_EQ(strides_ptr[1], 1);
+// 
============================================================================ +// CUDA Tests +// ============================================================================ - // Test another valid stride pattern - transpose-like - std::vector sizes_2 = {3, 2}; - std::vector strides_2 = {1, 3}; // Column-major strides +TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CUDA), 0); +} - Tensor* tensor_2 = create_tracked_tensor(sizes_2, strides_2); - EXPECT_NE(tensor_2, nullptr); +TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runMultiDimensionalEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Verify the tensor properties - EXPECT_EQ(tensor_2->dim(), 2); - EXPECT_EQ(tensor_2->size(0), 3); - EXPECT_EQ(tensor_2->size(1), 2); +TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runScalarTensorEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Check strides - int64_t* strides_ptr_2; - EXPECT_EQ(aoti_torch_get_strides(tensor_2, &strides_ptr_2), Error::Ok); - EXPECT_EQ(strides_ptr_2[0], 1); - EXPECT_EQ(strides_ptr_2[1], 3); +TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runZeroSizedTensorEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test edge case: zero-element tensor with non-zero dimensions -TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { - std::vector sizes = {2, 0, 3}; // Total elements = 0 - Tensor* tensor = create_tracked_tensor(sizes); - EXPECT_NE(tensor, nullptr); +TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runCustomStridesEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Verify the tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 0); - EXPECT_EQ(tensor->size(2), 3); - - // Should be able to get metadata - int64_t* sizes_ptr; - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 0); - EXPECT_EQ(sizes_ptr[2], 3); +TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runDifferentDtypesEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test different data types (currently we support bf16, fp32 and int32) -TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { +// ============================================================================ +// Verify Device Properties +// ============================================================================ + +TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCPUDevice) { std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - // Test float32 (dtype 6) - one of the supported types - Tensor* tensor_float32; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index - &tensor_float32); - - EXPECT_EQ(error, 
Error::Ok); - EXPECT_NE(tensor_float32, nullptr); - - // Test int32 (dtype 3) - one of the supported types - Tensor* tensor_int32; - error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 3, // int32 - unsupported - 1, // CUDA device - 0, // device index - &tensor_int32); + strides.data(), + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_int32, nullptr); + ASSERT_NE(tensor, nullptr); - // Test another unsupported data type - Tensor* tensor_float64; - error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 7, // float64 - unsupported - 1, // CUDA device - 0, // device index - &tensor_float64); + EXPECT_TRUE(tensor->is_cpu()); + EXPECT_FALSE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + delete tensor; } -// Test multi-dimensional tensors with various shapes -TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { - // Test 3D tensor - std::vector sizes_3d = {2, 3, 4}; - Tensor* tensor_3d = create_tracked_tensor(sizes_3d); - EXPECT_NE(tensor_3d, nullptr); - EXPECT_EQ(tensor_3d->dim(), 3); - EXPECT_EQ(tensor_3d->size(0), 2); - EXPECT_EQ(tensor_3d->size(1), 3); - EXPECT_EQ(tensor_3d->size(2), 4); - - // Test 4D tensor - std::vector sizes_4d = {2, 3, 4, 5}; - Tensor* tensor_4d = create_tracked_tensor(sizes_4d); - EXPECT_NE(tensor_4d, nullptr); - EXPECT_EQ(tensor_4d->dim(), 4); - EXPECT_EQ(tensor_4d->size(0), 2); - EXPECT_EQ(tensor_4d->size(1), 3); - EXPECT_EQ(tensor_4d->size(2), 4); - EXPECT_EQ(tensor_4d->size(3), 5); - - // Test 5D tensor - std::vector sizes_5d = {1, 2, 3, 4, 5}; - Tensor* tensor_5d = create_tracked_tensor(sizes_5d); - EXPECT_NE(tensor_5d, nullptr); - EXPECT_EQ(tensor_5d->dim(), 5); - EXPECT_EQ(tensor_5d->size(0), 1); - EXPECT_EQ(tensor_5d->size(1), 2); - EXPECT_EQ(tensor_5d->size(2), 3); - EXPECT_EQ(tensor_5d->size(3), 4); - EXPECT_EQ(tensor_5d->size(4), 5); -} +TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCUDADevice) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } -// Test incontiguous tensor creation - transpose-like layout -TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) { - // Create a tensor with transpose-like strides (column-major) - // For a 3x4 tensor in column-major order, strides should be [1, 3] - // This means each row step is 1, and each column step is 3 - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major (incontiguous) + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); + EXPECT_FALSE(tensor->is_cpu()); + EXPECT_TRUE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - // Verify the strides are what we specified - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - 
EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0 - EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1 - - // Verify that memory was allocated correctly for incontiguous layout - // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - - // 1) + 1 = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 1 * 2 + 3 * 3 + 1 = 2 + 9 + 1 = 12 - // elements Total bytes = 12 * 4 = 48 bytes (for float32) - EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for logical shape - - // The tensor should be accessible and writable - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Verify we can use CUDA to write to the memory - std::vector test_data(12, 1.0f); - cudaError_t cuda_err = cudaMemcpy( - data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); + delete tensor; } -// Test incontiguous tensor creation - expanded/broadcasted stride pattern -TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) { - // Create a tensor with expanded strides (simulating broadcasting) - // A 2x3x4 tensor where the first dimension has stride 0 (expanded) - // This creates a tensor where the first dimension is "broadcasted" - std::vector sizes = {2, 3, 4}; - std::vector strides = {0, 4, 1}; // First dimension has stride 0 +// ============================================================================ +// Error Cases +// ============================================================================ + +TEST_F(AOTITorchEmptyStridedSlimTest, NullReturnPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + nullptr); // null return pointer - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - - // Verify the strides are what we specified - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride - EXPECT_EQ(strides_ptr[1], 4); - EXPECT_EQ(strides_ptr[2], 1); - - // Verify that memory was allocated correctly for this incontiguous layout - // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - - // 1) + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 - - // 1) + 1 = 0 + 8 + 3 + 1 = 12 elements Note: numel() returns logical number - // of elements (2*3*4=24), not storage size - EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24 - - // The tensor should be accessible and writable - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Verify we can use CUDA to write to the allocated memory - // We only need to allocate 12 elements (storage size), not 24 - std::vector test_data(12, 2.0f); - cudaError_t cuda_err = cudaMemcpy( - data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(error, Error::InvalidArgument); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp 
b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp deleted file mode 100644 index d563eea98bc..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -// Helper to check if CUDA is available -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -// Test fixture for SlimTensor-based aoti_torch_empty_strided tests -class AOTITorchEmptyStridedSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Tensors are cleaned up via their destructors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - } - - // Track tensors for cleanup - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - private: - std::vector tensors_; -}; - -// ============================================================================ -// Common test body - parameterized by device type -// ============================================================================ - -void runBasicEmptyStridedTest(int32_t device_type, int32_t device_index) { - // Test 1D tensor - std::vector sizes_1d = {5}; - std::vector strides_1d = calculateContiguousStrides(sizes_1d); - - Tensor* tensor_1d = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes_1d.size(), - sizes_1d.data(), - strides_1d.data(), - static_cast(slim_c10::ScalarType::Float), // dtype = 6 - device_type, - device_index, - &tensor_1d); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor_1d, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor_1d->dim(), 1); - EXPECT_EQ(tensor_1d->size(0), 5); - EXPECT_EQ(tensor_1d->numel(), 5); - EXPECT_EQ( - static_cast(tensor_1d->dtype()), - static_cast(slim_c10::ScalarType::Float)); - EXPECT_NE(tensor_1d->data_ptr(), nullptr); - - // Cleanup - delete tensor_1d; -} - -void runMultiDimensionalEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - // Test 3D tensor - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - EXPECT_EQ(tensor->numel(), 24); - - // Check 
strides - EXPECT_EQ(tensor->stride(0), 12); - EXPECT_EQ(tensor->stride(1), 4); - EXPECT_EQ(tensor->stride(2), 1); - - delete tensor; -} - -void runScalarTensorEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - EXPECT_NE(tensor->data_ptr(), nullptr); - - delete tensor; -} - -void runZeroSizedTensorEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {0, 5}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); - EXPECT_EQ(tensor->numel(), 0); - - delete tensor; -} - -void runCustomStridesEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - // Create a transposed (column-major) tensor - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); - EXPECT_EQ(tensor->stride(0), 1); - EXPECT_EQ(tensor->stride(1), 3); - - // Non-contiguous due to custom strides - EXPECT_FALSE(tensor->is_contiguous()); - - delete tensor; -} - -void runDifferentDtypesEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - // Test Float32 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Float); - EXPECT_EQ(tensor->itemsize(), 4); - delete tensor; - } - - // Test BFloat16 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::BFloat16), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::BFloat16); - EXPECT_EQ(tensor->itemsize(), 2); - delete tensor; - } - - // Test Int64 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Long), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Long); - EXPECT_EQ(tensor->itemsize(), 8); - delete tensor; - } - - // Test Bool - { - 
Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Bool), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Bool); - EXPECT_EQ(tensor->itemsize(), 1); - delete tensor; - } -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CPU) { - runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CPU) { - runMultiDimensionalEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CPU) { - runScalarTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CPU) { - runZeroSizedTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CPU) { - runCustomStridesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CPU) { - runDifferentDtypesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runMultiDimensionalEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runScalarTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runZeroSizedTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runCustomStridesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runDifferentDtypesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -// ============================================================================ -// Verify Device Properties -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCPUDevice) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor); - - 
EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_TRUE(tensor->is_cpu()); - EXPECT_FALSE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - - delete tensor; -} - -TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCUDADevice) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_FALSE(tensor->is_cpu()); - EXPECT_TRUE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - - delete tensor; -} - -// ============================================================================ -// Error Cases -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, NullReturnPointer) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - nullptr); // null return pointer - - EXPECT_EQ(error, Error::InvalidArgument); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp index 8e6bcbbfad6..dee95cbafe2 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp @@ -7,197 +7,285 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; -using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +#include +#include +#include +#include +#include -// Test fixture for aoti_torch_item_bool tests -class AOTITorchItemBoolTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +using namespace executorch::backends::cuda; +using executorch::runtime::Error; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +namespace { - // Clear any remaining tensors from previous tests - clear_all_tensors(); - } +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchItemBoolSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a bool tensor on CUDA with a specific value - Tensor* create_cuda_bool_tensor(bool value) { - // Create a 0D (scalar) bool tensor - std::vector sizes = 
{}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar - Tensor* tensor; + Tensor* createScalarBoolTensor( + bool value, + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector sizes = {1}; + std::vector strides = {1}; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::BOOL), - static_cast(SupportedDevices::CUDA), - 0, + static_cast(slim_c10::ScalarType::Bool), + device_type, + device_index, &tensor); if (error != Error::Ok || tensor == nullptr) { return nullptr; } - // Set the value - bool host_value = value; - cudaError_t cuda_err = cudaMemcpy( - tensor->mutable_data_ptr(), - &host_value, - sizeof(bool), - cudaMemcpyHostToDevice); - - if (cuda_err != cudaSuccess) { - aoti_torch_delete_tensor_object(tensor); - return nullptr; + if (device_type == static_cast(slim_c10::DeviceType::CPU)) { + bool* data = static_cast(tensor->data_ptr()); + *data = value; + } else { + cudaMemcpy( + tensor->data_ptr(), &value, sizeof(bool), cudaMemcpyHostToDevice); } return tensor; } - // Helper to create a bool tensor on CPU with a specific value - Tensor* create_cpu_bool_tensor(bool value) { - // Create a 0D (scalar) bool tensor - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar - Tensor* tensor; + Tensor* createTestTensor( + const std::vector& sizes, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector strides(sizes.size()); + if (!sizes.empty()) { + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::BOOL), - static_cast(SupportedDevices::CPU), - 0, + dtype, + device_type, + device_index, &tensor); - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } - - // Set the value directly - bool* data_ptr = static_cast(tensor->mutable_data_ptr()); - *data_ptr = value; - - return tensor; + return (error == Error::Ok) ? 
tensor : nullptr; } }; -// Test extracting true value from CUDA bool tensor -TEST_F(AOTITorchItemBoolTest, CUDATensorTrueValue) { - Tensor* tensor = create_cuda_bool_tensor(true); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CPU) { + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CPU), 0); ASSERT_NE(tensor, nullptr); bool result = false; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_TRUE(result); + EXPECT_EQ(result, true); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting false value from CUDA bool tensor -TEST_F(AOTITorchItemBoolTest, CUDATensorFalseValue) { - Tensor* tensor = create_cuda_bool_tensor(false); +TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CPU) { + Tensor* tensor = createScalarBoolTensor( + false, static_cast(slim_c10::DeviceType::CPU), 0); ASSERT_NE(tensor, nullptr); bool result = true; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_FALSE(result); + EXPECT_EQ(result, false); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +// ============================================================================ +// Error Handling Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, NullTensor) { + bool result = false; + AOTITorchError error = aoti_torch_item_bool(nullptr, &result); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchItemBoolSlimTest, NullReturnValue) { + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CPU), 0); + ASSERT_NE(tensor, nullptr); + + AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor) { + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_GT(tensor->numel(), 1); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float) { + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Long) { + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting true value from CPU bool tensor 
-TEST_F(AOTITorchItemBoolTest, CPUTensorTrueValue) { - Tensor* tensor = create_cpu_bool_tensor(true); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CUDA), 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); bool result = false; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_TRUE(result); + EXPECT_EQ(result, true); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting false value from CPU bool tensor -TEST_F(AOTITorchItemBoolTest, CPUTensorFalseValue) { - Tensor* tensor = create_cpu_bool_tensor(false); +TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + Tensor* tensor = createScalarBoolTensor( + false, static_cast(slim_c10::DeviceType::CUDA), 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); bool result = true; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_FALSE(result); -} + EXPECT_EQ(result, false); -// Test with null tensor pointer -TEST_F(AOTITorchItemBoolTest, NullTensorPointer) { - bool result; - AOTITorchError error = aoti_torch_item_bool(nullptr, &result); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test with null result pointer -TEST_F(AOTITorchItemBoolTest, NullResultPointer) { - Tensor* tensor = create_cuda_bool_tensor(true); +TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); - AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test with non-bool dtype (should fail) -TEST_F(AOTITorchItemBoolTest, NonBoolDtype) { - // Create a float tensor - std::vector sizes = {}; - std::vector strides = {}; - Tensor* tensor; - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(SupportedDTypes::FLOAT32), // Not bool - static_cast(SupportedDevices::CUDA), - 0, - &tensor); - - ASSERT_EQ(error, Error::Ok); +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); - bool result; - error = aoti_torch_item_bool(tensor, &result); + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } diff --git 
a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp deleted file mode 100644 index dee95cbafe2..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -} // namespace - -class AOTITorchItemBoolSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createScalarBoolTensor( - bool value, - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector sizes = {1}; - std::vector strides = {1}; - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Bool), - device_type, - device_index, - &tensor); - - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } - - if (device_type == static_cast(slim_c10::DeviceType::CPU)) { - bool* data = static_cast(tensor->data_ptr()); - *data = value; - } else { - cudaMemcpy( - tensor->data_ptr(), &value, sizeof(bool), cudaMemcpyHostToDevice); - } - - return tensor; - } - - Tensor* createTestTensor( - const std::vector& sizes, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector strides(sizes.size()); - if (!sizes.empty()) { - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CPU) { - Tensor* tensor = createScalarBoolTensor( - true, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, true); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CPU) { - Tensor* tensor = createScalarBoolTensor( - false, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - bool result = true; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -// ============================================================================ -// Error Handling Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, NullTensor) { - bool result = false; - AOTITorchError error = aoti_torch_item_bool(nullptr, &result); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchItemBoolSlimTest, NullReturnValue) { - Tensor* tensor = createScalarBoolTensor( - true, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor) { - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_GT(tensor->numel(), 1); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float) { - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Long) { - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - Tensor* tensor = createScalarBoolTensor( - true, 
static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, true); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - Tensor* tensor = createScalarBoolTensor( - false, static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = true; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp index d123443cbfa..3a1de152f0b 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp @@ -7,64 +7,70 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_new_tensor_handle tests -class AOTITorchNewTensorHandleTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { + +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector 
strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +} // namespace + +class AOTITorchNewTensorHandleSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); + // SlimTensor uses automatic reference counting - no manual cleanup needed } - // Helper to create test tensors - Tensor* create_test_tensor( + Tensor* createTestTensor( const std::vector& sizes, const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - Tensor* tensor; + Tensor* tensor = nullptr; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, @@ -74,97 +80,106 @@ class AOTITorchNewTensorHandleTest : public ::testing::Test { } }; -// Test basic functionality of creating a new tensor handle -TEST_F(AOTITorchNewTensorHandleTest, BasicFunctionality) { - // Create an original tensor +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create a new handle from the original tensor Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); EXPECT_NE(new_tensor, nullptr); - // Verify the new tensor has the same properties EXPECT_EQ(new_tensor->dim(), orig_tensor->dim()); EXPECT_EQ(new_tensor->size(0), orig_tensor->size(0)); EXPECT_EQ(new_tensor->size(1), orig_tensor->size(1)); EXPECT_EQ(new_tensor->numel(), orig_tensor->numel()); - // Verify they share the same memory - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_tensor->mutable_data_ptr()); + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating new handle from null tensor -TEST_F(AOTITorchNewTensorHandleTest, NullOriginalTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, NullOriginalTensor) { Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(nullptr, &new_tensor); EXPECT_EQ(error, Error::InvalidArgument); } -// Test passing null pointer for new handle -TEST_F(AOTITorchNewTensorHandleTest, NullNewHandle) { +TEST_F(AOTITorchNewTensorHandleSlimTest, 
NullNewHandle) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, nullptr); EXPECT_EQ(error, Error::InvalidArgument); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test memory sharing between original and new tensor handle -TEST_F(AOTITorchNewTensorHandleTest, MemorySharing) { - // Create an original tensor +// ============================================================================ +// Memory Sharing Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CPU) { std::vector sizes = {3, 4}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Get original memory pointer - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); ASSERT_NE(orig_ptr, nullptr); - // Create a new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify both tensors point to the same memory - void* new_ptr = new_tensor->mutable_data_ptr(); + void* new_ptr = new_tensor->data_ptr(); EXPECT_EQ(orig_ptr, new_ptr); - // Clean up - deleting one should not affect the other's validity EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // New tensor should still be valid and accessible - void* still_valid_ptr = new_tensor->mutable_data_ptr(); + void* still_valid_ptr = new_tensor->data_ptr(); EXPECT_EQ(still_valid_ptr, new_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating multiple handles from the same tensor -TEST_F(AOTITorchNewTensorHandleTest, MultipleHandles) { - // Create an original tensor +TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create multiple handles std::vector handles; const int num_handles = 5; @@ -174,246 +189,165 @@ TEST_F(AOTITorchNewTensorHandleTest, MultipleHandles) { aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); handles.push_back(new_tensor); } - // Delete original tensor EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid for (Tensor* handle : handles) { - EXPECT_EQ(handle->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle->data_ptr(), orig_ptr); EXPECT_EQ(handle->dim(), 2); EXPECT_EQ(handle->size(0), 2); EXPECT_EQ(handle->size(1), 3); } - // Delete all handles for (Tensor* handle : handles) { EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); } } -// Test creating handle from tensor 
with custom strides -TEST_F(AOTITorchNewTensorHandleTest, CustomStrides) { +// ============================================================================ +// Tensor Property Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, CustomStrides_CPU) { std::vector sizes = {3, 4}; std::vector strides = {4, 1}; // Row-major strides - Tensor* orig_tensor = create_test_tensor(sizes, strides); + Tensor* orig_tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify strides are preserved - int64_t* orig_strides_ptr; - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(orig_tensor, &orig_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); + EXPECT_EQ(orig_tensor->stride(0), new_tensor->stride(0)); + EXPECT_EQ(orig_tensor->stride(1), new_tensor->stride(1)); - EXPECT_EQ(orig_strides_ptr[0], new_strides_ptr[0]); - EXPECT_EQ(orig_strides_ptr[1], new_strides_ptr[1]); - - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from bfloat16 tensor -TEST_F(AOTITorchNewTensorHandleTest, BFloat16Tensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, BFloat16Tensor_CPU) { std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = create_test_tensor( + Tensor* orig_tensor = createTestTensor( sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA)); + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Verify original is bfloat16 - int32_t orig_dtype; - EXPECT_EQ(aoti_torch_get_dtype(orig_tensor, &orig_dtype), Error::Ok); - EXPECT_EQ(orig_dtype, static_cast(SupportedDTypes::BFLOAT16)); - - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify new tensor is also bfloat16 - int32_t new_dtype; - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ(new_dtype, static_cast(SupportedDTypes::BFLOAT16)); - - // Verify element size (bfloat16 should be 2 bytes) - EXPECT_EQ(new_tensor->element_size(), 2); + EXPECT_EQ(new_tensor->itemsize(), 2); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from scalar (0D) tensor -TEST_F(AOTITorchNewTensorHandleTest, ScalarTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, ScalarTensor_CPU) { std::vector sizes = {}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); EXPECT_EQ(orig_tensor->dim(), 0); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify scalar properties EXPECT_EQ(new_tensor->dim(), 0); 
EXPECT_EQ(new_tensor->numel(), 1); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from zero-sized tensor -TEST_F(AOTITorchNewTensorHandleTest, ZeroSizedTensor) { - std::vector sizes = {0, 5}; - Tensor* orig_tensor = create_test_tensor(sizes); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->numel(), 0); - - // Attempt to create new handle - should fail because zero-sized tensors have - // null data pointers - Tensor* new_tensor = nullptr; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - - // Zero-sized tensors are not currently supported - EXPECT_EQ(error, Error::InvalidArgument); - EXPECT_EQ(new_tensor, nullptr); - - // Clean up original tensor - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// Test creating handle from large multi-dimensional tensor -TEST_F(AOTITorchNewTensorHandleTest, LargeMultiDimensionalTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, LargeMultiDimensionalTensor_CPU) { std::vector sizes = {10, 20, 30}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify dimensions EXPECT_EQ(new_tensor->dim(), 3); EXPECT_EQ(new_tensor->size(0), 10); EXPECT_EQ(new_tensor->size(1), 20); EXPECT_EQ(new_tensor->size(2), 30); EXPECT_EQ(new_tensor->numel(), 6000); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle preserves tensor metadata -TEST_F(AOTITorchNewTensorHandleTest, MetadataPreservation) { - std::vector sizes = {2, 3, 4}; - std::vector strides = {12, 4, 1}; - Tensor* orig_tensor = create_test_tensor( - sizes, - strides, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); - ASSERT_NE(orig_tensor, nullptr); - - // Create new handle - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - - // Get and compare all metadata - int64_t* orig_sizes_ptr; - int64_t* new_sizes_ptr; - int64_t* orig_strides_ptr; - int64_t* new_strides_ptr; - int32_t orig_dtype, new_dtype; - int32_t orig_device_type, new_device_type; - int32_t orig_device_index, new_device_index; - - EXPECT_EQ(aoti_torch_get_sizes(orig_tensor, &orig_sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(new_tensor, &new_sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(orig_tensor, &orig_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_dtype(orig_tensor, &orig_dtype), Error::Ok); - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_type(orig_tensor, &orig_device_type), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_type(new_tensor, &new_device_type), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_index(orig_tensor, &orig_device_index), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_index(new_tensor, &new_device_index), Error::Ok); - - // Verify 
all metadata matches - for (int i = 0; i < 3; i++) { - EXPECT_EQ(orig_sizes_ptr[i], new_sizes_ptr[i]); - EXPECT_EQ(orig_strides_ptr[i], new_strides_ptr[i]); - } - EXPECT_EQ(orig_dtype, new_dtype); - EXPECT_EQ(orig_device_type, new_device_type); - EXPECT_EQ(orig_device_index, new_device_index); +// ============================================================================ +// Handle Chain Tests +// ============================================================================ - // Clean up - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -// Test creating handle chain: orig -> handle1 -> handle2 -TEST_F(AOTITorchNewTensorHandleTest, HandleChain) { +TEST_F(AOTITorchNewTensorHandleSlimTest, HandleChain_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create first handle Tensor* handle1; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &handle1); EXPECT_EQ(error, Error::Ok); ASSERT_NE(handle1, nullptr); - EXPECT_EQ(handle1->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle1->data_ptr(), orig_ptr); - // Create second handle from the first handle Tensor* handle2; error = aoti_torch_new_tensor_handle(handle1, &handle2); EXPECT_EQ(error, Error::Ok); ASSERT_NE(handle2, nullptr); - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); - // Delete in reverse order EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test creating handle and verifying reference counting -TEST_F(AOTITorchNewTensorHandleTest, ReferenceCountingTest) { +TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create multiple handles Tensor* handle1; Tensor* handle2; Tensor* handle3; @@ -422,116 +356,276 @@ TEST_F(AOTITorchNewTensorHandleTest, ReferenceCountingTest) { EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok); EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok); - // Delete original EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid - EXPECT_EQ(handle1->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle1->data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); - // Delete handles one by one EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); - // Remaining handles should still be valid - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), 
orig_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); - // Last handle should still be valid - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok); } -// Test creating handle from int32 tensor -TEST_F(AOTITorchNewTensorHandleTest, Int32Tensor) { +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, Int64Tensor_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor( + Tensor* orig_tensor = createTestTensor( sizes, {}, - 3, // int32 - static_cast(SupportedDevices::CUDA)); + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify dtype - int32_t new_dtype; - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ(new_dtype, 3); // int32 + EXPECT_EQ(new_tensor->itemsize(), 8); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle with incontiguous tensor (transpose-like layout) -TEST_F(AOTITorchNewTensorHandleTest, IncontiguousTransposeLayout) { +TEST_F(AOTITorchNewTensorHandleSlimTest, IncontiguousLayout_CPU) { std::vector sizes = {3, 4}; std::vector strides = {1, 3}; // Column-major (incontiguous) - Tensor* orig_tensor = create_test_tensor(sizes, strides); + Tensor* orig_tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify strides are preserved - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(new_strides_ptr[0], 1); - EXPECT_EQ(new_strides_ptr[1], 3); + EXPECT_EQ(new_tensor->stride(0), 1); + EXPECT_EQ(new_tensor->stride(1), 3); - // Verify both tensors share the same memory - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_tensor->mutable_data_ptr()); + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle with expanded strides (broadcasted dimension) -TEST_F(AOTITorchNewTensorHandleTest, ExpandedStrides) { - std::vector sizes = {2, 3, 4}; - std::vector strides = {0, 4, 1}; // First dimension has stride 0 - Tensor* orig_tensor = create_test_tensor(sizes, strides); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + 
static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(orig_tensor, nullptr); + EXPECT_TRUE(orig_tensor->is_cuda()); + + Tensor* new_tensor; + AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(new_tensor, nullptr); + EXPECT_TRUE(new_tensor->is_cuda()); + + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); +} + +TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + ASSERT_NE(orig_ptr, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify expanded strides are preserved - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(new_strides_ptr[0], 0); - EXPECT_EQ(new_strides_ptr[1], 4); - EXPECT_EQ(new_strides_ptr[2], 1); + void* new_ptr = new_tensor->data_ptr(); + EXPECT_EQ(orig_ptr, new_ptr); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + void* still_valid_ptr = new_tensor->data_ptr(); + EXPECT_EQ(still_valid_ptr, new_ptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Stress test: create many handles -TEST_F(AOTITorchNewTensorHandleTest, StressTestManyHandles) { +TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + + std::vector handles; + const int num_handles = 5; + + for (int i = 0; i < num_handles; i++) { + Tensor* new_tensor; + AOTITorchError error = + aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(new_tensor, nullptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); + handles.push_back(new_tensor); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + for (Tensor* handle : handles) { + EXPECT_EQ(handle->data_ptr(), orig_ptr); + EXPECT_TRUE(handle->is_cuda()); + } + + for (Tensor* handle : handles) { + EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); + } +} + +TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + + Tensor* handle1; + Tensor* handle2; + Tensor* handle3; + + EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok); + EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok); + 
EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(handle1->data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok); +} + +// ============================================================================ +// Mixed Device Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, MixedDeviceHandles) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + + Tensor* cpu_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(cpu_tensor, nullptr); + EXPECT_TRUE(cpu_tensor->is_cpu()); + + Tensor* cuda_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(cuda_tensor, nullptr); + EXPECT_TRUE(cuda_tensor->is_cuda()); + + Tensor* cpu_handle; + Tensor* cuda_handle; + + EXPECT_EQ(aoti_torch_new_tensor_handle(cpu_tensor, &cpu_handle), Error::Ok); + EXPECT_EQ(aoti_torch_new_tensor_handle(cuda_tensor, &cuda_handle), Error::Ok); + + EXPECT_TRUE(cpu_handle->is_cpu()); + EXPECT_TRUE(cuda_handle->is_cuda()); + EXPECT_NE(cpu_handle->data_ptr(), cuda_handle->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_handle), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_handle), Error::Ok); +} + +// ============================================================================ +// Stress Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, StressTestManyHandles_CPU) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create many handles const int num_handles = 100; std::vector handles; @@ -541,19 +635,16 @@ TEST_F(AOTITorchNewTensorHandleTest, StressTestManyHandles) { aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); handles.push_back(new_tensor); } - // Delete original EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid for (Tensor* handle : handles) { - EXPECT_EQ(handle->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle->data_ptr(), orig_ptr); } - // Delete all handles for (Tensor* handle : handles) { EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp deleted file mode 100644 index 3a1de152f0b..00000000000 --- 
a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp +++ /dev/null @@ -1,651 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchNewTensorHandleSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // SlimTensor uses automatic reference counting - no manual cleanup needed - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CPU) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(new_tensor, nullptr); - - EXPECT_EQ(new_tensor->dim(), orig_tensor->dim()); - EXPECT_EQ(new_tensor->size(0), orig_tensor->size(0)); - EXPECT_EQ(new_tensor->size(1), orig_tensor->size(1)); - EXPECT_EQ(new_tensor->numel(), orig_tensor->numel()); - - EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, NullOriginalTensor) { - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(nullptr, &new_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, NullNewHandle) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// ============================================================================ -// Memory Sharing Tests -// ============================================================================ - -TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CPU) { - std::vector sizes = {3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - ASSERT_NE(orig_ptr, nullptr); - - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - - void* new_ptr = new_tensor->data_ptr(); - EXPECT_EQ(orig_ptr, new_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - void* still_valid_ptr = new_tensor->data_ptr(); - EXPECT_EQ(still_valid_ptr, new_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CPU) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector handles; - const int num_handles = 5; - - for (int i = 0; i < num_handles; i++) { - Tensor* new_tensor; - AOTITorchError error = - aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); - handles.push_back(new_tensor); - } - - 
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-    EXPECT_EQ(handle->dim(), 2);
-    EXPECT_EQ(handle->size(0), 2);
-    EXPECT_EQ(handle->size(1), 3);
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
-
-// ============================================================================
-// Tensor Property Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, CustomStrides_CPU) {
-  std::vector<int64_t> sizes = {3, 4};
-  std::vector<int64_t> strides = {4, 1}; // Row-major strides
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      strides,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(orig_tensor->stride(0), new_tensor->stride(0));
-  EXPECT_EQ(orig_tensor->stride(1), new_tensor->stride(1));
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, BFloat16Tensor_CPU) {
-  std::vector<int64_t> sizes = {2, 3, 4};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::BFloat16),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->itemsize(), 2);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ScalarTensor_CPU) {
-  std::vector<int64_t> sizes = {};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-  EXPECT_EQ(orig_tensor->dim(), 0);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->dim(), 0);
-  EXPECT_EQ(new_tensor->numel(), 1);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, LargeMultiDimensionalTensor_CPU) {
-  std::vector<int64_t> sizes = {10, 20, 30};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->dim(), 3);
-  EXPECT_EQ(new_tensor->size(0), 10);
-  EXPECT_EQ(new_tensor->size(1), 20);
-  EXPECT_EQ(new_tensor->size(2), 30);
-  EXPECT_EQ(new_tensor->numel(), 6000);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-// ============================================================================
-// Handle Chain Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, HandleChain_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &handle1);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(handle1, nullptr);
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-
-  Tensor* handle2;
-  error = aoti_torch_new_tensor_handle(handle1, &handle2);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(handle2, nullptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  Tensor* handle2;
-  Tensor* handle3;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok);
-}
-
-// ============================================================================
-// Different Dtype Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, Int64Tensor_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Long),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->itemsize(), 8);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, IncontiguousLayout_CPU) {
-  std::vector<int64_t> sizes = {3, 4};
-  std::vector<int64_t> strides = {1, 3}; // Column-major (incontiguous)
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      strides,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->stride(0), 1);
-  EXPECT_EQ(new_tensor->stride(1), 3);
-
-  EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-// ============================================================================
-// CUDA Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-  EXPECT_TRUE(orig_tensor->is_cuda());
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-
-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(new_tensor, nullptr);
-  EXPECT_TRUE(new_tensor->is_cuda());
-
-  EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {3, 4};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-  ASSERT_NE(orig_ptr, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  void* new_ptr = new_tensor->data_ptr();
-  EXPECT_EQ(orig_ptr, new_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  void* still_valid_ptr = new_tensor->data_ptr();
-  EXPECT_EQ(still_valid_ptr, new_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  std::vector<Tensor*> handles;
-  const int num_handles = 5;
-
-  for (int i = 0; i < num_handles; i++) {
-    Tensor* new_tensor;
-    AOTITorchError error =
-        aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-    EXPECT_EQ(error, Error::Ok);
-    ASSERT_NE(new_tensor, nullptr);
-    EXPECT_EQ(new_tensor->data_ptr(), orig_ptr);
-    handles.push_back(new_tensor);
-  }
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-    EXPECT_TRUE(handle->is_cuda());
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  Tensor* handle2;
-  Tensor* handle3;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok);
-}
-
-// ============================================================================
-// Mixed Device Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MixedDeviceHandles) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-
-  Tensor* cpu_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(cpu_tensor, nullptr);
-  EXPECT_TRUE(cpu_tensor->is_cpu());
-
-  Tensor* cuda_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(cuda_tensor, nullptr);
-  EXPECT_TRUE(cuda_tensor->is_cuda());
-
-  Tensor* cpu_handle;
-  Tensor* cuda_handle;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(cpu_tensor, &cpu_handle), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(cuda_tensor, &cuda_handle), Error::Ok);
-
-  EXPECT_TRUE(cpu_handle->is_cpu());
-  EXPECT_TRUE(cuda_handle->is_cuda());
-  EXPECT_NE(cpu_handle->data_ptr(), cuda_handle->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_handle), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_handle), Error::Ok);
-}
-
-// ============================================================================
-// Stress Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, StressTestManyHandles_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  const int num_handles = 100;
-  std::vector<Tensor*> handles;
-
-  for (int i = 0; i < num_handles; i++) {
-    Tensor* new_tensor;
-    AOTITorchError error =
-        aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-    EXPECT_EQ(error, Error::Ok);
-    ASSERT_NE(new_tensor, nullptr);
-    EXPECT_EQ(new_tensor->data_ptr(), orig_ptr);
-    handles.push_back(new_tensor);
-  }
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
index 4474f8cf57e..8517ec21af6 100644
--- a/backends/cuda/runtime/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -31,6 +31,7 @@
   } while (0)
 
 // CUDA error checking macro (without return, for use in void functions)
+#ifndef ET_CUDA_CHECK
 #define ET_CUDA_CHECK(EXPR) \
   do { \
     const cudaError_t err = EXPR; \
@@ -45,6 +46,7 @@
         cudaGetErrorString(err)); \
     ET_CHECK_MSG(false, "CUDA error: %s", cudaGetErrorString(err)); \
   } while (0)
+#endif
 
 // Kernel launch check macro (with return)
 #define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \