diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index d5582dfe7c7..121f4b60418 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -25,34 +25,25 @@ endif()
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
-# Common AOTI functionality - combines all AOTI common components
-set(_aoti_common_sources common_shims.cpp)
-add_library(aoti_common STATIC ${_aoti_common_sources})
+# Common AOTI functionality - header-only library for common shims
+add_library(aoti_common INTERFACE)
 target_include_directories(
   aoti_common
-  PUBLIC $ $
-         $
+  INTERFACE $
+            $
+            $
 )
 target_compile_options(
   aoti_common
-  PUBLIC $<$:/EHsc /GR>
-         $<$>:-fexceptions -frtti -fPIC>
+  INTERFACE $<$:/EHsc /GR>
+            $<$>:-fexceptions -frtti -fPIC>
 )
 target_compile_definitions(
-  aoti_common PRIVATE $<$:EXPORT_AOTI_FUNCTIONS>
+  aoti_common INTERFACE $<$:EXPORT_AOTI_FUNCTIONS>
 )
-# Ensure symbols are exported properly
-if(APPLE)
-  target_link_options(aoti_common PUBLIC -Wl,-export_dynamic)
-else()
-  target_link_options(
-    aoti_common PUBLIC $<$>:-Wl,--export-dynamic>
-  )
-endif()
 
 # Link against ExecuTorch libraries and standard libraries
-target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
-executorch_target_link_options_shared_lib(aoti_common)
+target_link_libraries(aoti_common INTERFACE extension_tensor ${CMAKE_DL_LIBS})
 
 install(
   TARGETS aoti_common
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index b14e02da9ef..7447292e5d9 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -11,6 +11,11 @@
 #include
 #include
 #include
+#include
+
+#ifdef CUDA_AVAILABLE
+#include
+#endif
 
 namespace executorch {
 namespace backends {
@@ -95,6 +100,7 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
   AOTInductorModelContainerRunFunc run;
   AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
+
 };
 
 } // namespace aoti
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
deleted file mode 100644
index abfde86db6d..00000000000
--- a/backends/aoti/common_shims.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -namespace internal { -// Global storage for tensor metadata -AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_sizes; -AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_strides; -} // namespace internal - -extern "C" { - -// Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled() { - // No autograd ever - return false; -} - -void aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - throw std::runtime_error("Cannot enable autograd"); - } -} - -// Tensor attribute operations -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr) { - *ret_data_ptr = tensor->mutable_data_ptr(); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset) { - // Storage offset is always 0 in ET - *ret_storage_offset = 0; - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { - auto it = internal::tensor_to_strides.find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_strides.end()) { - needs_update = true; - } else { - // CRITICAL: Multimodal models reuse tensors with different shapes across - // executions (e.g., variable-length audio). We MUST validate cached - // metadata matches current tensor state, or CUDA kernels will receive - // incorrect shapes leading to memory corruption and segfaults. - auto tensor_strides = tensor->strides(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_strides.begin(), - tensor_strides.end()); - } - - if (needs_update) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = - internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors, but we need to - // return a valid pointer - if (it->second.empty()) { - static int64_t empty_strides_placeholder = 0; - *ret_strides = &empty_strides_placeholder; - } else { - *ret_strides = it->second.data(); - } - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { - *ret_dtype = static_cast(tensor->scalar_type()); - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { - auto it = internal::tensor_to_sizes.find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_sizes.end()) { - needs_update = true; - } else { - // CRITICAL: Multimodal models reuse tensors with different shapes across - // executions (e.g., variable-length audio). We MUST validate cached - // metadata matches current tensor state, or CUDA kernels will receive - // incorrect shapes leading to memory corruption and segfaults. 
- auto tensor_sizes = tensor->sizes(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_sizes.begin(), - tensor_sizes.end()); - } - - if (needs_update) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors, but we need to - // return a valid pointer - if (it->second.empty()) { - static int64_t empty_sizes_placeholder = 0; - *ret_sizes = &empty_sizes_placeholder; - } else { - *ret_sizes = it->second.data(); - } - - return Error::Ok; -} - -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index) { - // Let's assume all tensors AOTI using are on CUDA:0 - *ret_device_index = 0; - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { - *ret_dim = static_cast(tensor->dim()); - return Error::Ok; -} - -// Device and layout utility functions -int32_t aoti_torch_device_type_cpu() { - // Let's say cpu is 0 for ET as well - return 0; -} - -int32_t aoti_torch_layout_strided() { - // ET only support strided layout, the return value will always be 0, a.k.a - // at::Layout::Strided; - return 0; -} - -// Dtype constants - these return the PyTorch dtype codes -int32_t aoti_torch_dtype_float32() { - return 6; // PyTorch's float32 dtype code -} - -int32_t aoti_torch_dtype_bfloat16() { - return 15; // PyTorch's bfloat16 dtype code -} - -int32_t aoti_torch_dtype_int8() { - return 1; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_int16() { - return 2; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_int32() { - return 3; // PyTorch's int32 dtype code -} - -int32_t aoti_torch_dtype_bool() { - return 11; // PyTorch's bool dtype code -} - -int32_t aoti_torch_dtype_int64() { - return 4; // PyTorch's int64 dtype code -} - -// Dtype utility function needed by Metal backend. -// Returns the size of the dtype in bytes. 
-size_t aoti_torch_dtype_element_size(int32_t dtype) { - return dtype_to_element_size(dtype); -} - -// Cleanup functions -void cleanup_tensor_metadata() { - internal::tensor_to_sizes.clear(); - internal::tensor_to_strides.clear(); -} - -AOTI_SHIM_EXPORT void aoti_torch_warn( - const char* func, - const char* file, - uint32_t line, - const char* msg) { - ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { - (void)tensor; - (void)ret_size; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { - (void)self; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { - (void)self; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data_ptr, - int64_t ndim, - const int64_t* sizes, - const int64_t* strides, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor) { - (void)data_ptr; - (void)ndim; - (void)sizes; - (void)strides; - (void)storage_offset; - (void)dtype; - (void)device_type; - (void)device_index; - (void)ret_new_tensor; - throw std::runtime_error("Not implemented"); - return Error::Internal; -} - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 3fc414fb669..dfcdecd2bc2 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -9,104 +9,343 @@ #pragma once #include -#include #include -#include #include #include #include +// Uses conditional compilation to separate the implementation between +// CUDA backend (SlimTensor) and other backends like MPS (ETensor). +// The caller determines which path is used by defining CUDA_AVAILABLE. 
+#ifdef CUDA_AVAILABLE +#include +#else +#include +#endif + namespace executorch { namespace backends { namespace aoti { // Common using declarations for ExecuTorch types using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; -extern "C" { +// ============================================================ +// Tensor Type Definition - branched based on CUDA_AVAILABLE +// ============================================================ +#ifdef CUDA_AVAILABLE +using Tensor = executorch::backends::aoti::slim::SlimTensor; +#else +using Tensor = executorch::runtime::etensor::Tensor; +#endif // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Attribute-related operations (memory-irrelevant) -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +#ifndef CUDA_AVAILABLE +namespace internal { +// Global storage for tensor metadata (ETensor path only) +// SlimTensor stores sizes/strides directly in int64_t[] - no caching needed +inline std::unordered_map>& tensor_to_sizes() { + static std::unordered_map> instance; + return instance; +} +inline std::unordered_map>& tensor_to_strides() { + static std::unordered_map> instance; + return instance; +} +} // namespace internal +#endif + +// ============================================================ +// Basic Property Getters - Inline implementations +// ============================================================ + +inline AOTITorchError aoti_torch_get_data_ptr( + Tensor* tensor, + void** ret_data_ptr) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_data_ptr == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + *ret_data_ptr = tensor->data_ptr(); +#else + *ret_data_ptr = tensor->mutable_data_ptr(); +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_sizes( + Tensor* tensor, + int64_t** ret_sizes) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_sizes == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor stores sizes directly in int64_t[] - no caching needed + *ret_sizes = const_cast(tensor->sizes().data()); +#else + auto it = internal::tensor_to_sizes().find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_sizes().end()) { + needs_update = true; + } else { + // Validate cached metadata matches current tensor state + auto tensor_sizes = tensor->sizes(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_sizes.begin(), + tensor_sizes.end()); + } + + if (needs_update) { + std::vector sizes(tensor->dim()); + auto tensor_sizes = tensor->sizes(); + for (int i = 0; i < tensor->dim(); i++) { + sizes[i] = tensor_sizes[i]; + } + it = internal::tensor_to_sizes() + .insert_or_assign(tensor, std::move(sizes)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors + if (it->second.empty()) { + static int64_t empty_sizes_placeholder = 0; + *ret_sizes = &empty_sizes_placeholder; + } else { + *ret_sizes = it->second.data(); + } +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_strides( + Tensor* tensor, + int64_t** ret_strides) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_strides == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor stores strides 
directly in int64_t[] - no caching needed + *ret_strides = const_cast(tensor->strides().data()); +#else + auto it = internal::tensor_to_strides().find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_strides().end()) { + needs_update = true; + } else { + // Validate cached metadata matches current tensor state + auto tensor_strides = tensor->strides(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_strides.begin(), + tensor_strides.end()); + } + + if (needs_update) { + std::vector strides(tensor->dim()); + auto tensor_strides = tensor->strides(); + for (int i = 0; i < tensor->dim(); i++) { + strides[i] = tensor_strides[i]; + } + it = internal::tensor_to_strides() + .insert_or_assign(tensor, std::move(strides)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors + if (it->second.empty()) { + static int64_t empty_strides_placeholder = 0; + *ret_strides = &empty_strides_placeholder; + } else { + *ret_strides = it->second.data(); + } +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_dtype == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + *ret_dtype = static_cast(tensor->dtype()); +#else + *ret_dtype = static_cast(tensor->scalar_type()); +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_dim == nullptr) { + return Error::InvalidArgument; + } + + *ret_dim = static_cast(tensor->dim()); + return Error::Ok; +} + +// ============================================================ +// Storage & Device Property Getters - Inline implementations +// ============================================================ + +inline AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_storage_offset == nullptr) { + return Error::InvalidArgument; + } + +#ifdef CUDA_AVAILABLE + // SlimTensor supports real storage offset + *ret_storage_offset = tensor->storage_offset(); +#else + // ETensor doesn't support storage_offset, return 0 + *ret_storage_offset = 0; +#endif + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_storage_size( + Tensor* tensor, + int64_t* ret_size) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_size == nullptr) { + return Error::InvalidArgument; + } + + *ret_size = static_cast(tensor->nbytes()); + return Error::Ok; +} + +inline AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_device_type == nullptr) { + return Error::InvalidArgument; + } -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); +#ifdef CUDA_AVAILABLE + // SlimTensor supports real device type + *ret_device_type = static_cast(tensor->device_type()); +#else + // ETensor is always CPU in default mode + *ret_device_type = 0; // CPU +#endif + return Error::Ok; +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +inline AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index) { + if (tensor == nullptr) { + return Error::InvalidArgument; + } + if (ret_device_index == 
nullptr) { + return Error::InvalidArgument; + } -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +#ifdef CUDA_AVAILABLE + // SlimTensor supports real device index + *ret_device_index = static_cast(tensor->device_index()); +#else + // ETensor doesn't support multi-device, return 0 + *ret_device_index = 0; +#endif + return Error::Ok; +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +// ============================================================ +// DType Constants - These return PyTorch ScalarType enum values +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +inline int32_t aoti_torch_dtype_float32() { + return 6; // ScalarType::Float +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); +inline int32_t aoti_torch_dtype_bfloat16() { + return 15; // ScalarType::BFloat16 +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +inline int32_t aoti_torch_dtype_int64() { + return 4; // ScalarType::Long +} -// Utility functions for device and layout information -AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); -AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); -AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); +inline int32_t aoti_torch_dtype_int32() { + return 3; // ScalarType::Int +} -// Dtype utility function needed by Metal backend -AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); +inline int32_t aoti_torch_dtype_int16() { + return 2; // ScalarType::Short +} -// Autograd mode functions -AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); -AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); +inline int32_t aoti_torch_dtype_int8() { + return 1; // ScalarType::Char +} -// Cleanup functions for clearing global state -AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); +inline int32_t aoti_torch_dtype_bool() { + return 11; // ScalarType::Bool +} -AOTI_SHIM_EXPORT void aoti_torch_warn( - const char* func, - const char* file, - uint32_t line, - const char* msg); +// ============================================================ +// Device Type Constants +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +inline int32_t aoti_torch_device_type_cpu() { + return 0; // DeviceType::CPU +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); +inline int32_t aoti_torch_device_type_cuda() { + return 1; // DeviceType::CUDA +} -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); +// ============================================================ +// Grad Mode Functions (not supported in ExecuTorch) +// ============================================================ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data_ptr, - int64_t ndim, - const int64_t* sizes, - const int64_t* strides, - int64_t storage_offset, - int32_t dtype, - int32_t 
device_type, - int32_t device_index, - Tensor** ret_new_tensor); +inline bool aoti_torch_grad_mode_is_enabled() { + return false; // ExecuTorch doesn't support autograd +} -} // extern "C" +inline AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) { + if (enabled) { + return Error::NotSupported; // Grad mode not supported in ExecuTorch + } + return Error::Ok; +} } // namespace aoti } // namespace backends diff --git a/backends/aoti/common_shims_slim.h b/backends/aoti/common_shims_slim.h deleted file mode 100644 index dfcdecd2bc2..00000000000 --- a/backends/aoti/common_shims_slim.h +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -// Uses conditional compilation to separate the implementation between -// CUDA backend (SlimTensor) and other backends like MPS (ETensor). -// The caller determines which path is used by defining CUDA_AVAILABLE. -#ifdef CUDA_AVAILABLE -#include -#else -#include -#endif - -namespace executorch { -namespace backends { -namespace aoti { - -// Common using declarations for ExecuTorch types -using executorch::runtime::Error; - -// ============================================================ -// Tensor Type Definition - branched based on CUDA_AVAILABLE -// ============================================================ -#ifdef CUDA_AVAILABLE -using Tensor = executorch::backends::aoti::slim::SlimTensor; -#else -using Tensor = executorch::runtime::etensor::Tensor; -#endif - -// Common AOTI type aliases -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -#ifndef CUDA_AVAILABLE -namespace internal { -// Global storage for tensor metadata (ETensor path only) -// SlimTensor stores sizes/strides directly in int64_t[] - no caching needed -inline std::unordered_map>& tensor_to_sizes() { - static std::unordered_map> instance; - return instance; -} -inline std::unordered_map>& tensor_to_strides() { - static std::unordered_map> instance; - return instance; -} -} // namespace internal -#endif - -// ============================================================ -// Basic Property Getters - Inline implementations -// ============================================================ - -inline AOTITorchError aoti_torch_get_data_ptr( - Tensor* tensor, - void** ret_data_ptr) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_data_ptr == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - *ret_data_ptr = tensor->data_ptr(); -#else - *ret_data_ptr = tensor->mutable_data_ptr(); -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_sizes( - Tensor* tensor, - int64_t** ret_sizes) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_sizes == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor stores sizes directly in int64_t[] - no caching needed - *ret_sizes = const_cast(tensor->sizes().data()); -#else - auto it = internal::tensor_to_sizes().find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_sizes().end()) { - needs_update = true; - } else { - // Validate cached metadata matches current tensor state - auto tensor_sizes = tensor->sizes(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_sizes.begin(), - tensor_sizes.end()); 
- } - - if (needs_update) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = internal::tensor_to_sizes() - .insert_or_assign(tensor, std::move(sizes)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors - if (it->second.empty()) { - static int64_t empty_sizes_placeholder = 0; - *ret_sizes = &empty_sizes_placeholder; - } else { - *ret_sizes = it->second.data(); - } -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_strides( - Tensor* tensor, - int64_t** ret_strides) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_strides == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor stores strides directly in int64_t[] - no caching needed - *ret_strides = const_cast(tensor->strides().data()); -#else - auto it = internal::tensor_to_strides().find(tensor); - bool needs_update = false; - - if (it == internal::tensor_to_strides().end()) { - needs_update = true; - } else { - // Validate cached metadata matches current tensor state - auto tensor_strides = tensor->strides(); - needs_update = !std::equal( - it->second.begin(), - it->second.end(), - tensor_strides.begin(), - tensor_strides.end()); - } - - if (needs_update) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = internal::tensor_to_strides() - .insert_or_assign(tensor, std::move(strides)) - .first; - } - - // For 0D tensors, data() returns nullptr on empty vectors - if (it->second.empty()) { - static int64_t empty_strides_placeholder = 0; - *ret_strides = &empty_strides_placeholder; - } else { - *ret_strides = it->second.data(); - } -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_dtype == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - *ret_dtype = static_cast(tensor->dtype()); -#else - *ret_dtype = static_cast(tensor->scalar_type()); -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_dim == nullptr) { - return Error::InvalidArgument; - } - - *ret_dim = static_cast(tensor->dim()); - return Error::Ok; -} - -// ============================================================ -// Storage & Device Property Getters - Inline implementations -// ============================================================ - -inline AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_storage_offset == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real storage offset - *ret_storage_offset = tensor->storage_offset(); -#else - // ETensor doesn't support storage_offset, return 0 - *ret_storage_offset = 0; -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_storage_size( - Tensor* tensor, - int64_t* ret_size) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_size == nullptr) { - return Error::InvalidArgument; - } - - *ret_size = static_cast(tensor->nbytes()); - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_device_type( - 
Tensor* tensor, - int32_t* ret_device_type) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_device_type == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real device type - *ret_device_type = static_cast(tensor->device_type()); -#else - // ETensor is always CPU in default mode - *ret_device_type = 0; // CPU -#endif - return Error::Ok; -} - -inline AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index) { - if (tensor == nullptr) { - return Error::InvalidArgument; - } - if (ret_device_index == nullptr) { - return Error::InvalidArgument; - } - -#ifdef CUDA_AVAILABLE - // SlimTensor supports real device index - *ret_device_index = static_cast(tensor->device_index()); -#else - // ETensor doesn't support multi-device, return 0 - *ret_device_index = 0; -#endif - return Error::Ok; -} - -// ============================================================ -// DType Constants - These return PyTorch ScalarType enum values -// ============================================================ - -inline int32_t aoti_torch_dtype_float32() { - return 6; // ScalarType::Float -} - -inline int32_t aoti_torch_dtype_bfloat16() { - return 15; // ScalarType::BFloat16 -} - -inline int32_t aoti_torch_dtype_int64() { - return 4; // ScalarType::Long -} - -inline int32_t aoti_torch_dtype_int32() { - return 3; // ScalarType::Int -} - -inline int32_t aoti_torch_dtype_int16() { - return 2; // ScalarType::Short -} - -inline int32_t aoti_torch_dtype_int8() { - return 1; // ScalarType::Char -} - -inline int32_t aoti_torch_dtype_bool() { - return 11; // ScalarType::Bool -} - -// ============================================================ -// Device Type Constants -// ============================================================ - -inline int32_t aoti_torch_device_type_cpu() { - return 0; // DeviceType::CPU -} - -inline int32_t aoti_torch_device_type_cuda() { - return 1; // DeviceType::CUDA -} - -// ============================================================ -// Grad Mode Functions (not supported in ExecuTorch) -// ============================================================ - -inline bool aoti_torch_grad_mode_is_enabled() { - return false; // ExecuTorch doesn't support autograd -} - -inline AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - return Error::NotSupported; // Grad mode not supported in ExecuTorch - } - return Error::Ok; -} - -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/slim/c10/cuda/Exception.h b/backends/aoti/slim/c10/cuda/Exception.h index 33d8414e661..4db5781eb9f 100644 --- a/backends/aoti/slim/c10/cuda/Exception.h +++ b/backends/aoti/slim/c10/cuda/Exception.h @@ -19,12 +19,14 @@ /// Checks a CUDA expression and aborts on error. /// @param EXPR The CUDA expression to check. +#ifndef ET_CUDA_CHECK #define ET_CUDA_CHECK(EXPR) \ do { \ const cudaError_t __err = EXPR; \ ET_CHECK_MSG( \ __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \ } while (0) +#endif /// Checks a CUDA expression and logs a warning on error (non-fatal). /// @param EXPR The CUDA expression to check. 
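For reference (illustrative only, not part of this patch): the common_shims.h changes above make the shim layer header-only, with the tensor type selected entirely by whether CUDA_AVAILABLE is defined when the consumer is compiled. A minimal consumer sketch follows; the include path and the -DCUDA_AVAILABLE flag spelling are assumptions inferred from the targets in this diff, not verified build settings.

    // Illustrative sketch only. Assumes the header is reachable as
    // <executorch/backends/aoti/common_shims.h> and that CUDA_AVAILABLE is the
    // macro named in the diff; build with -DCUDA_AVAILABLE for the SlimTensor
    // path, without it for the ETensor path.
    #include <cstdint>
    #include <executorch/backends/aoti/common_shims.h>

    namespace aoti = executorch::backends::aoti;
    using executorch::runtime::Error;

    // Element count computed through the shim API; the same code compiles
    // against either tensor type because aoti::Tensor is resolved at compile
    // time by the CUDA_AVAILABLE branch in common_shims.h.
    inline int64_t numel_via_shims(aoti::Tensor* t) {
      int64_t dim = 0;
      int64_t* sizes = nullptr;
      if (aoti::aoti_torch_get_dim(t, &dim) != Error::Ok ||
          aoti::aoti_torch_get_sizes(t, &sizes) != Error::Ok) {
        return -1; // sentinel for this sketch; real code would propagate Error
      }
      int64_t n = 1;
      for (int64_t i = 0; i < dim; ++i) {
        n *= sizes[i];
      }
      return n;
    }

Because the branch is resolved at compile time and all shims are inline, consumers no longer need the export-dynamic link options or link_whole registration machinery removed elsewhere in this diff.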
diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl
index 588dbc14831..ffe27e1d1e3 100644
--- a/backends/aoti/targets.bzl
+++ b/backends/aoti/targets.bzl
@@ -33,26 +33,22 @@ def define_common_targets():
         ],
     )
 
-    # AOTI common shims functionality
+    # AOTI common shims functionality (header-only library)
+    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
+    # - With CUDA_AVAILABLE=1: Uses SlimTensor
+    # - Without CUDA_AVAILABLE: Uses ETensor
     runtime.cxx_library(
         name = "common_shims",
-        srcs = [
-            "common_shims.cpp",
-        ],
         headers = [
             "common_shims.h",
             "export.h",
             "utils.h",
         ],
-        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
-        link_whole = True,
-        supports_python_dlopen = True,
-        # Constructor needed for backend registration.
-        compiler_flags = ["-Wno-global-constructors"],
         visibility = ["PUBLIC"],
-        deps = [
+        exported_deps = [
            "//executorch/runtime/core:core",
            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/backends/aoti/slim/core:slimtensor",
         ],
     )
 
@@ -86,21 +82,3 @@ def define_common_targets():
             ":delegate_handle",
         ],
     )
-
-    # SlimTensor-based common shims (header-only library)
-    # The caller determines which tensor type is used by defining CUDA_AVAILABLE.
-    # - With CUDA_AVAILABLE=1: Uses SlimTensor
-    # - Without CUDA_AVAILABLE: Uses ETensor
-    runtime.cxx_library(
-        name = "common_shims_slim",
-        headers = [
-            "common_shims_slim.h",
-            "export.h",
-        ],
-        visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
-            "//executorch/runtime/core:core",
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/backends/aoti/slim/core:slimtensor",
-        ],
-    )
diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS
index d92e0e32a1f..8b3e8a7f4b1 100644
--- a/backends/aoti/tests/TARGETS
+++ b/backends/aoti/tests/TARGETS
@@ -8,27 +8,8 @@ cpp_unittest(
     srcs = [
         "test_common_shims.cpp",
     ],
-    headers = [
-        "utils.h",
-    ],
     deps = [
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/platform:platform",
-        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
-        "//executorch/runtime/core/exec_aten:lib",
-        "//executorch/extension/tensor:tensor",
-    ],
-)
-
-cpp_unittest(
-    name = "test_common_shims_slim",
-    srcs = [
-        "test_common_shims_slim.cpp",
-    ],
-    deps = [
-        "//executorch/backends/aoti:common_shims_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/runtime/core:core",
diff --git a/backends/aoti/tests/test_common_shims.cpp b/backends/aoti/tests/test_common_shims.cpp
index 0fd1b057f99..3bc76e522cf 100644
--- a/backends/aoti/tests/test_common_shims.cpp
+++ b/backends/aoti/tests/test_common_shims.cpp
@@ -6,330 +6,627 @@
  * LICENSE file in the root directory of this source tree.
*/ -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include + +#ifdef CUDA_AVAILABLE +#include +#endif + using namespace executorch::backends::aoti; -using namespace executorch::backends::aoti::test; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; + +namespace slim_c10 = executorch::backends::aoti::slim::c10; +namespace slim = executorch::backends::aoti::slim; + +namespace { + +#ifdef CUDA_AVAILABLE +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} +#endif + +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} + +} // namespace -// Test fixture for common shims tests -class CommonShimsTest : public ::testing::Test { +// Test fixture for common_shims_slim tests +class CommonShimsSlimTest : public ::testing::Test { protected: void SetUp() override { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); + et_pal_init(); } void TearDown() override { - // Clean up metadata and free any tensor data - cleanup_tensor_metadata(); - for (auto& tensor : test_tensors_) { - free_tensor_data(tensor.get()); + // Cleanup tracked tensors + for (Tensor* t : tensors_) { + delete t; } - test_tensors_.clear(); + tensors_.clear(); } - // Helper to create and track test tensors for cleanup - Tensor* create_tracked_tensor(const std::vector& sizes) { - auto tensor = create_test_tensor(sizes); - Tensor* ptr = tensor.get(); - test_tensors_.push_back(tensor); - return ptr; + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); + } + } + + Tensor* createTestTensor( + const std::vector& sizes, + slim_c10::DeviceType device_type) { + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); + return tensor; } private: - std::vector> test_tensors_; + std::vector tensors_; }; -// Test aoti_torch_get_sizes basic functionality -TEST_F(CommonShimsTest, GetSizesBasicFunctionality) { - // Test 1D tensor - auto tensor_1d = create_tracked_tensor({5}); - int64_t* sizes_ptr; - AOTITorchError error = aoti_torch_get_sizes(tensor_1d, &sizes_ptr); +// ============================================================================ +// Common test body implementations - parameterized by device type +// ============================================================================ + +void runGetDataPtrTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + void* data_ptr = nullptr; + AOTITorchError error = aoti_torch_get_data_ptr(tensor, &data_ptr); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 5); + 
EXPECT_NE(data_ptr, nullptr); - // Test 2D tensor - auto tensor_2d = create_tracked_tensor({3, 4}); - error = aoti_torch_get_sizes(tensor_2d, &sizes_ptr); + // Verify the returned pointer matches tensor's data_ptr + EXPECT_EQ(data_ptr, tensor->data_ptr()); + + delete tensor; +} + +void runGetSizesTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t* ret_sizes = nullptr; + AOTITorchError error = aoti_torch_get_sizes(tensor, &ret_sizes); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 3); - EXPECT_EQ(sizes_ptr[1], 4); + EXPECT_NE(ret_sizes, nullptr); - // Test 3D tensor - auto tensor_3d = create_tracked_tensor({2, 3, 4}); - error = aoti_torch_get_sizes(tensor_3d, &sizes_ptr); + // Verify sizes match + EXPECT_EQ(ret_sizes[0], 2); + EXPECT_EQ(ret_sizes[1], 3); + EXPECT_EQ(ret_sizes[2], 4); + + delete tensor; +} + +void runGetStridesTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t* ret_strides = nullptr; + AOTITorchError error = aoti_torch_get_strides(tensor, &ret_strides); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 3); - EXPECT_EQ(sizes_ptr[2], 4); + EXPECT_NE(ret_strides, nullptr); + + // Verify strides match: [12, 4, 1] for contiguous [2, 3, 4] + EXPECT_EQ(ret_strides[0], 12); + EXPECT_EQ(ret_strides[1], 4); + EXPECT_EQ(ret_strides[2], 1); + + delete tensor; +} + +void runGetDtypeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + // Test Float32 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Float)); + + delete tensor; + } + + // Test Int64 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Long, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Long)); + + delete tensor; + } + + // Test BFloat16 + { + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::BFloat16, + device)); + + int32_t ret_dtype = -1; + AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::BFloat16)); + + delete tensor; + } } -// Test aoti_torch_get_strides basic functionality -TEST_F(CommonShimsTest, GetStridesBasicFunctionality) { +void runGetDimTest(slim_c10::DeviceType device_type) { + slim_c10::Device 
device(device_type, 0); + + // Test 0D tensor (scalar) + { + std::vector sizes = {}; + std::vector strides = {}; + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 0); + + delete tensor; + } + // Test 1D tensor - auto tensor_1d = create_tracked_tensor({5}); - int64_t* strides_ptr; - AOTITorchError error = aoti_torch_get_strides(tensor_1d, &strides_ptr); + { + std::vector sizes = {5}; + std::vector strides = calculateContiguousStrides(sizes); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 1); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); - // Test 2D tensor - row major: [3, 4] should have strides [4, 1] - auto tensor_2d = create_tracked_tensor({3, 4}); - error = aoti_torch_get_strides(tensor_2d, &strides_ptr); + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 4); - EXPECT_EQ(strides_ptr[1], 1); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 1); - // Test 3D tensor - row major: [2, 3, 4] should have strides [12, 4, 1] - auto tensor_3d = create_tracked_tensor({2, 3, 4}); - error = aoti_torch_get_strides(tensor_3d, &strides_ptr); + delete tensor; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 12); - EXPECT_EQ(strides_ptr[1], 4); - EXPECT_EQ(strides_ptr[2], 1); + // Test 3D tensor + { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_dim = -1; + AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); + + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(ret_dim, 3); + + delete tensor; + } } -// Test caching logic for sizes -TEST_F(CommonShimsTest, SizesCachingLogic) { - auto tensor = create_tracked_tensor({2, 3, 4}); +// ============================================================================ +// Storage & Device Property Tests +// ============================================================================ + +void runGetStorageOffsetTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_storage_offset = -1; + AOTITorchError error = + aoti_torch_get_storage_offset(tensor, &ret_storage_offset); - // First call should cache the sizes - int64_t* sizes_ptr1; - AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr1); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr1, nullptr); + // Default storage offset for newly created tensor is 0 + EXPECT_EQ(ret_storage_offset, 0); + + delete tensor; +} + +void runGetStorageSizeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new 
Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int64_t ret_size = -1; + AOTITorchError error = aoti_torch_get_storage_size(tensor, &ret_size); - // Second call should return the same cached pointer - int64_t* sizes_ptr2; - error = aoti_torch_get_sizes(tensor, &sizes_ptr2); EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(sizes_ptr1, sizes_ptr2); // Should be the exact same pointer + // 2 * 3 * sizeof(float) = 6 * 4 = 24 bytes + EXPECT_EQ(ret_size, 24); - // Values should still be correct - EXPECT_EQ(sizes_ptr2[0], 2); - EXPECT_EQ(sizes_ptr2[1], 3); - EXPECT_EQ(sizes_ptr2[2], 4); + delete tensor; } -// Test caching logic for strides -TEST_F(CommonShimsTest, StridesCachingLogic) { - auto tensor = create_tracked_tensor({2, 3, 4}); +void runGetDeviceTypeTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_device_type = -1; + AOTITorchError error = aoti_torch_get_device_type(tensor, &ret_device_type); - // First call should cache the strides - int64_t* strides_ptr1; - AOTITorchError error = aoti_torch_get_strides(tensor, &strides_ptr1); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr1, nullptr); + EXPECT_EQ(ret_device_type, static_cast(device_type)); + + delete tensor; +} + +void runGetDeviceIndexTest(slim_c10::DeviceType device_type) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(device_type, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + + int32_t ret_device_index = -1; + AOTITorchError error = aoti_torch_get_device_index(tensor, &ret_device_index); - // Second call should return the same cached pointer - int64_t* strides_ptr2; - error = aoti_torch_get_strides(tensor, &strides_ptr2); EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(strides_ptr1, strides_ptr2); // Should be the exact same pointer + EXPECT_EQ(ret_device_index, 0); - // Values should still be correct - EXPECT_EQ(strides_ptr2[0], 12); - EXPECT_EQ(strides_ptr2[1], 4); - EXPECT_EQ(strides_ptr2[2], 1); + delete tensor; } -// Test that different tensors have different cached entries -TEST_F(CommonShimsTest, DifferentTensorsCacheSeparately) { - auto tensor1 = create_tracked_tensor({2, 3}); - auto tensor2 = create_tracked_tensor({4, 5}); +// ============================================================================ +// CPU Tests +// ============================================================================ - // Get sizes for both tensors - int64_t* sizes1_ptr; - int64_t* sizes2_ptr; +TEST_F(CommonShimsSlimTest, GetDataPtr_CPU) { + runGetDataPtrTest(slim_c10::DeviceType::CPU); +} - EXPECT_EQ(aoti_torch_get_sizes(tensor1, &sizes1_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor2, &sizes2_ptr), Error::Ok); +TEST_F(CommonShimsSlimTest, GetSizes_CPU) { + runGetSizesTest(slim_c10::DeviceType::CPU); +} - // Pointers should be different (different cache entries) - EXPECT_NE(sizes1_ptr, sizes2_ptr); +TEST_F(CommonShimsSlimTest, GetStrides_CPU) { + runGetStridesTest(slim_c10::DeviceType::CPU); +} - // Values should be correct - EXPECT_EQ(sizes1_ptr[0], 2); - EXPECT_EQ(sizes1_ptr[1], 3); - 
EXPECT_EQ(sizes2_ptr[0], 4); - EXPECT_EQ(sizes2_ptr[1], 5); +TEST_F(CommonShimsSlimTest, GetDtype_CPU) { + runGetDtypeTest(slim_c10::DeviceType::CPU); +} - // Test strides as well - int64_t* strides1_ptr; - int64_t* strides2_ptr; +TEST_F(CommonShimsSlimTest, GetDim_CPU) { + runGetDimTest(slim_c10::DeviceType::CPU); +} - EXPECT_EQ(aoti_torch_get_strides(tensor1, &strides1_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor2, &strides2_ptr), Error::Ok); +TEST_F(CommonShimsSlimTest, GetStorageOffset_CPU) { + runGetStorageOffsetTest(slim_c10::DeviceType::CPU); +} - // Pointers should be different (different cache entries) - EXPECT_NE(strides1_ptr, strides2_ptr); +TEST_F(CommonShimsSlimTest, GetStorageSize_CPU) { + runGetStorageSizeTest(slim_c10::DeviceType::CPU); +} - // Values should be correct - EXPECT_EQ(strides1_ptr[0], 3); - EXPECT_EQ(strides1_ptr[1], 1); - EXPECT_EQ(strides2_ptr[0], 5); - EXPECT_EQ(strides2_ptr[1], 1); +TEST_F(CommonShimsSlimTest, GetDeviceType_CPU) { + runGetDeviceTypeTest(slim_c10::DeviceType::CPU); } -// Test cache persistence across multiple calls -TEST_F(CommonShimsTest, CachePersistence) { - auto tensor = create_tracked_tensor({3, 4, 5}); +TEST_F(CommonShimsSlimTest, GetDeviceIndex_CPU) { + runGetDeviceIndexTest(slim_c10::DeviceType::CPU); +} - // Multiple calls to sizes should all return the same pointer - int64_t* sizes_ptr1; - int64_t* sizes_ptr2; - int64_t* sizes_ptr3; +// ============================================================================ +// CUDA Tests +// ============================================================================ - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr3), Error::Ok); +#ifdef CUDA_AVAILABLE +TEST_F(CommonShimsSlimTest, GetDataPtr_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDataPtrTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(sizes_ptr1, sizes_ptr2); - EXPECT_EQ(sizes_ptr2, sizes_ptr3); +TEST_F(CommonShimsSlimTest, GetSizes_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetSizesTest(slim_c10::DeviceType::CUDA); +} - // Multiple calls to strides should all return the same pointer - int64_t* strides_ptr1; - int64_t* strides_ptr2; - int64_t* strides_ptr3; +TEST_F(CommonShimsSlimTest, GetStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStridesTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr3), Error::Ok); +TEST_F(CommonShimsSlimTest, GetDtype_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDtypeTest(slim_c10::DeviceType::CUDA); +} - EXPECT_EQ(strides_ptr1, strides_ptr2); - EXPECT_EQ(strides_ptr2, strides_ptr3); +TEST_F(CommonShimsSlimTest, GetDim_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDimTest(slim_c10::DeviceType::CUDA); } -// Test 0D tensor (scalar) -TEST_F(CommonShimsTest, ScalarTensor) { - auto tensor_0d = create_tracked_tensor({}); +TEST_F(CommonShimsSlimTest, GetStorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStorageOffsetTest(slim_c10::DeviceType::CUDA); +} - // Test sizes for 0D tensor - int64_t* sizes_ptr; - AOTITorchError 
error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); +TEST_F(CommonShimsSlimTest, GetStorageSize_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetStorageSizeTest(slim_c10::DeviceType::CUDA); +} - // Test strides for 0D tensor - int64_t* strides_ptr; - error = aoti_torch_get_strides(tensor_0d, &strides_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); +TEST_F(CommonShimsSlimTest, GetDeviceType_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDeviceTypeTest(slim_c10::DeviceType::CUDA); +} - // Cache should work for 0D tensors too - int64_t* sizes_ptr2; - error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr2); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(sizes_ptr, sizes_ptr2); +TEST_F(CommonShimsSlimTest, GetDeviceIndex_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runGetDeviceIndexTest(slim_c10::DeviceType::CUDA); +} +#endif + +// ============================================================================ +// Error Cases +// ============================================================================ + +TEST_F(CommonShimsSlimTest, NullTensorArgument) { + void* data_ptr = nullptr; + int64_t* sizes = nullptr; + int64_t* strides = nullptr; + int32_t dtype = -1; + int64_t dim = -1; + + EXPECT_EQ( + aoti_torch_get_data_ptr(nullptr, &data_ptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_sizes(nullptr, &sizes), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_strides(nullptr, &strides), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dtype(nullptr, &dtype), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dim(nullptr, &dim), Error::InvalidArgument); } -// Test large tensor dimensions -TEST_F(CommonShimsTest, LargeTensorDimensions) { - auto tensor = create_tracked_tensor({100, 200, 300, 400}); +TEST_F(CommonShimsSlimTest, NullReturnPointer) { + Tensor* tensor = createTestTensor({2, 3}, slim_c10::DeviceType::CPU); - // Test sizes - int64_t* sizes_ptr; - AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(sizes_ptr, nullptr); - EXPECT_EQ(sizes_ptr[0], 100); - EXPECT_EQ(sizes_ptr[1], 200); - EXPECT_EQ(sizes_ptr[2], 300); - EXPECT_EQ(sizes_ptr[3], 400); - - // Test strides - expected: [24000000, 120000, 400, 1] - int64_t* strides_ptr; - error = aoti_torch_get_strides(tensor, &strides_ptr); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(strides_ptr, nullptr); - EXPECT_EQ(strides_ptr[0], 24000000); - EXPECT_EQ(strides_ptr[1], 120000); - EXPECT_EQ(strides_ptr[2], 400); - EXPECT_EQ(strides_ptr[3], 1); + EXPECT_EQ(aoti_torch_get_data_ptr(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_sizes(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_strides(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dtype(tensor, nullptr), Error::InvalidArgument); + EXPECT_EQ(aoti_torch_get_dim(tensor, nullptr), Error::InvalidArgument); } -// Test that cleanup_tensor_metadata clears the cache -TEST_F(CommonShimsTest, CleanupFunctionality) { - auto tensor = create_tracked_tensor({2, 3}); +// ============================================================================ +// Edge Cases +// ============================================================================ - // Cache some data - int64_t* sizes_ptr1; - int64_t* strides_ptr1; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - 
EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); +TEST_F(CommonShimsSlimTest, ScalarTensor) { + std::vector sizes = {}; + std::vector strides = {}; + slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - // Clear the cache - cleanup_tensor_metadata(); + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); - // Getting sizes/strides again should create new cache entries - // (We can't directly test if the pointers are different since that would be - // implementation-dependent, but we can at least verify the functions still - // work) - int64_t* sizes_ptr2; - int64_t* strides_ptr2; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + // Get sizes and strides for 0D tensor + int64_t* ret_sizes = nullptr; + int64_t* ret_strides = nullptr; + int64_t ret_dim = -1; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); + EXPECT_NE(ret_sizes, nullptr); + + EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); + EXPECT_NE(ret_strides, nullptr); - // Values should still be correct - EXPECT_EQ(sizes_ptr2[0], 2); - EXPECT_EQ(sizes_ptr2[1], 3); - EXPECT_EQ(strides_ptr2[0], 3); - EXPECT_EQ(strides_ptr2[1], 1); + EXPECT_EQ(aoti_torch_get_dim(tensor, &ret_dim), Error::Ok); + EXPECT_EQ(ret_dim, 0); } -// Test mixed operations to ensure caches are independent -TEST_F(CommonShimsTest, IndependentCaches) { - auto tensor = create_tracked_tensor({2, 3, 4}); +TEST_F(CommonShimsSlimTest, LargeTensor) { + std::vector sizes = {100, 200, 300}; + std::vector strides = calculateContiguousStrides(sizes); + slim_c10::Device device(slim_c10::DeviceType::CPU, 0); + + Tensor* tensor = new Tensor(slim::empty_strided( + slim::makeArrayRef(sizes), + slim::makeArrayRef(strides), + slim_c10::ScalarType::Float, + device)); + trackTensor(tensor); + + int64_t* ret_sizes = nullptr; + int64_t* ret_strides = nullptr; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); + EXPECT_EQ(ret_sizes[0], 100); + EXPECT_EQ(ret_sizes[1], 200); + EXPECT_EQ(ret_sizes[2], 300); + + EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); + EXPECT_EQ(ret_strides[0], 60000); // 200 * 300 + EXPECT_EQ(ret_strides[1], 300); // 300 + EXPECT_EQ(ret_strides[2], 1); +} - // Get sizes first - int64_t* sizes_ptr1; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); +TEST_F(CommonShimsSlimTest, ConsistentPointerReturn) { + Tensor* tensor = createTestTensor({2, 3, 4}, slim_c10::DeviceType::CPU); - // Get strides - int64_t* strides_ptr1; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + // Multiple calls should return the same pointer (for SlimTensor) + int64_t* sizes_ptr1 = nullptr; + int64_t* sizes_ptr2 = nullptr; - // Get sizes again - should be cached - int64_t* sizes_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); EXPECT_EQ(sizes_ptr1, sizes_ptr2); - // Get strides again - should be cached - int64_t* strides_ptr2; + int64_t* strides_ptr1 = nullptr; + int64_t* strides_ptr2 = nullptr; + + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); EXPECT_EQ(strides_ptr1, strides_ptr2); +} + +// ============================================================================ +// 
DType Constants Tests +// ============================================================================ + +TEST_F(CommonShimsSlimTest, DTypeConstants) { + // Verify dtype constants match expected PyTorch ScalarType values + EXPECT_EQ(aoti_torch_dtype_float32(), 6); // ScalarType::Float + EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // ScalarType::BFloat16 + EXPECT_EQ(aoti_torch_dtype_int64(), 4); // ScalarType::Long + EXPECT_EQ(aoti_torch_dtype_int32(), 3); // ScalarType::Int + EXPECT_EQ(aoti_torch_dtype_int16(), 2); // ScalarType::Short + EXPECT_EQ(aoti_torch_dtype_int8(), 1); // ScalarType::Char + EXPECT_EQ(aoti_torch_dtype_bool(), 11); // ScalarType::Bool +} + +// ============================================================================ +// Device Type Constants Tests +// ============================================================================ - // Sizes and strides pointers should be different (different caches) - EXPECT_NE(sizes_ptr1, strides_ptr1); +TEST_F(CommonShimsSlimTest, DeviceTypeConstants) { + EXPECT_EQ(aoti_torch_device_type_cpu(), 0); // DeviceType::CPU + EXPECT_EQ(aoti_torch_device_type_cuda(), 1); // DeviceType::CUDA } -// Test all dtype functions return correct PyTorch dtype codes -TEST_F(CommonShimsTest, AllDtypesReturnCorrectValues) { - EXPECT_EQ(aoti_torch_dtype_float32(), 6); // PyTorch's float32 dtype code - EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // PyTorch's bfloat16 dtype code - EXPECT_EQ(aoti_torch_dtype_int8(), 1); // PyTorch's int8 dtype code - EXPECT_EQ(aoti_torch_dtype_int16(), 2); // PyTorch's int16 dtype code - EXPECT_EQ(aoti_torch_dtype_int32(), 3); // PyTorch's int32 dtype code - EXPECT_EQ(aoti_torch_dtype_int64(), 4); // PyTorch's int64 dtype code - EXPECT_EQ(aoti_torch_dtype_bool(), 11); // PyTorch's bool dtype code +// ============================================================================ +// Grad Mode Tests +// ============================================================================ + +TEST_F(CommonShimsSlimTest, GradModeIsEnabled) { + // ExecuTorch doesn't support autograd, so should always return false + EXPECT_EQ(aoti_torch_grad_mode_is_enabled(), false); +} + +TEST_F(CommonShimsSlimTest, GradModeSetEnabled) { + // Setting to false should succeed + EXPECT_EQ(aoti_torch_grad_mode_set_enabled(false), Error::Ok); + + // Setting to true should fail (not supported in ExecuTorch) + EXPECT_EQ(aoti_torch_grad_mode_set_enabled(true), Error::NotSupported); } diff --git a/backends/aoti/tests/test_common_shims_slim.cpp b/backends/aoti/tests/test_common_shims_slim.cpp deleted file mode 100644 index 94319c6f94d..00000000000 --- a/backends/aoti/tests/test_common_shims_slim.cpp +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_AVAILABLE -#include -#endif - -using namespace executorch::backends::aoti; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; -namespace slim = executorch::backends::aoti::slim; - -namespace { - -#ifdef CUDA_AVAILABLE -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} -#endif - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -// Test fixture for common_shims_slim tests -class CommonShimsSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Cleanup tracked tensors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - } - - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - Tensor* createTestTensor( - const std::vector& sizes, - slim_c10::DeviceType device_type) { - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - return tensor; - } - - private: - std::vector tensors_; -}; - -// ============================================================================ -// Common test body implementations - parameterized by device type -// ============================================================================ - -void runGetDataPtrTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - void* data_ptr = nullptr; - AOTITorchError error = aoti_torch_get_data_ptr(tensor, &data_ptr); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(data_ptr, nullptr); - - // Verify the returned pointer matches tensor's data_ptr - EXPECT_EQ(data_ptr, tensor->data_ptr()); - - delete tensor; -} - -void runGetSizesTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t* ret_sizes = nullptr; - AOTITorchError error = aoti_torch_get_sizes(tensor, &ret_sizes); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(ret_sizes, nullptr); - - // Verify sizes match - EXPECT_EQ(ret_sizes[0], 2); - EXPECT_EQ(ret_sizes[1], 3); - EXPECT_EQ(ret_sizes[2], 4); - - delete tensor; -} - -void runGetStridesTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - 
slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t* ret_strides = nullptr; - AOTITorchError error = aoti_torch_get_strides(tensor, &ret_strides); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(ret_strides, nullptr); - - // Verify strides match: [12, 4, 1] for contiguous [2, 3, 4] - EXPECT_EQ(ret_strides[0], 12); - EXPECT_EQ(ret_strides[1], 4); - EXPECT_EQ(ret_strides[2], 1); - - delete tensor; -} - -void runGetDtypeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - // Test Float32 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Float)); - - delete tensor; - } - - // Test Int64 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Long, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::Long)); - - delete tensor; - } - - // Test BFloat16 - { - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::BFloat16, - device)); - - int32_t ret_dtype = -1; - AOTITorchError error = aoti_torch_get_dtype(tensor, &ret_dtype); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dtype, static_cast(slim_c10::ScalarType::BFloat16)); - - delete tensor; - } -} - -void runGetDimTest(slim_c10::DeviceType device_type) { - slim_c10::Device device(device_type, 0); - - // Test 0D tensor (scalar) - { - std::vector sizes = {}; - std::vector strides = {}; - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 0); - - delete tensor; - } - - // Test 1D tensor - { - std::vector sizes = {5}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 1); - - delete tensor; - } - - // Test 3D tensor - { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_dim = -1; - AOTITorchError error = aoti_torch_get_dim(tensor, &ret_dim); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_dim, 3); - - delete tensor; - } -} - -// ============================================================================ -// Storage & Device Property Tests -// ============================================================================ - -void runGetStorageOffsetTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - 
std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_storage_offset = -1; - AOTITorchError error = - aoti_torch_get_storage_offset(tensor, &ret_storage_offset); - - EXPECT_EQ(error, Error::Ok); - // Default storage offset for newly created tensor is 0 - EXPECT_EQ(ret_storage_offset, 0); - - delete tensor; -} - -void runGetStorageSizeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int64_t ret_size = -1; - AOTITorchError error = aoti_torch_get_storage_size(tensor, &ret_size); - - EXPECT_EQ(error, Error::Ok); - // 2 * 3 * sizeof(float) = 6 * 4 = 24 bytes - EXPECT_EQ(ret_size, 24); - - delete tensor; -} - -void runGetDeviceTypeTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_device_type = -1; - AOTITorchError error = aoti_torch_get_device_type(tensor, &ret_device_type); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_device_type, static_cast(device_type)); - - delete tensor; -} - -void runGetDeviceIndexTest(slim_c10::DeviceType device_type) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(device_type, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - - int32_t ret_device_index = -1; - AOTITorchError error = aoti_torch_get_device_index(tensor, &ret_device_index); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(ret_device_index, 0); - - delete tensor; -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, GetDataPtr_CPU) { - runGetDataPtrTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetSizes_CPU) { - runGetSizesTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStrides_CPU) { - runGetStridesTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDtype_CPU) { - runGetDtypeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDim_CPU) { - runGetDimTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStorageOffset_CPU) { - runGetStorageOffsetTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetStorageSize_CPU) { - runGetStorageSizeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDeviceType_CPU) { - runGetDeviceTypeTest(slim_c10::DeviceType::CPU); -} - -TEST_F(CommonShimsSlimTest, GetDeviceIndex_CPU) { - runGetDeviceIndexTest(slim_c10::DeviceType::CPU); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -#ifdef CUDA_AVAILABLE 
-TEST_F(CommonShimsSlimTest, GetDataPtr_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDataPtrTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetSizes_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetSizesTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStrides_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStridesTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDtype_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDtypeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDim_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDimTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStorageOffset_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStorageOffsetTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetStorageSize_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetStorageSizeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDeviceType_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDeviceTypeTest(slim_c10::DeviceType::CUDA); -} - -TEST_F(CommonShimsSlimTest, GetDeviceIndex_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runGetDeviceIndexTest(slim_c10::DeviceType::CUDA); -} -#endif - -// ============================================================================ -// Error Cases -// ============================================================================ - -TEST_F(CommonShimsSlimTest, NullTensorArgument) { - void* data_ptr = nullptr; - int64_t* sizes = nullptr; - int64_t* strides = nullptr; - int32_t dtype = -1; - int64_t dim = -1; - - EXPECT_EQ( - aoti_torch_get_data_ptr(nullptr, &data_ptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_sizes(nullptr, &sizes), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_strides(nullptr, &strides), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dtype(nullptr, &dtype), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dim(nullptr, &dim), Error::InvalidArgument); -} - -TEST_F(CommonShimsSlimTest, NullReturnPointer) { - Tensor* tensor = createTestTensor({2, 3}, slim_c10::DeviceType::CPU); - - EXPECT_EQ(aoti_torch_get_data_ptr(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_sizes(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_strides(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dtype(tensor, nullptr), Error::InvalidArgument); - EXPECT_EQ(aoti_torch_get_dim(tensor, nullptr), Error::InvalidArgument); -} - -// ============================================================================ -// Edge Cases -// ============================================================================ - -TEST_F(CommonShimsSlimTest, ScalarTensor) { - std::vector sizes = {}; - std::vector strides = {}; - slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - - // Get sizes and strides for 0D tensor - int64_t* ret_sizes = nullptr; - int64_t* ret_strides = nullptr; - int64_t ret_dim = -1; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, 
&ret_sizes), Error::Ok); - EXPECT_NE(ret_sizes, nullptr); - - EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); - EXPECT_NE(ret_strides, nullptr); - - EXPECT_EQ(aoti_torch_get_dim(tensor, &ret_dim), Error::Ok); - EXPECT_EQ(ret_dim, 0); -} - -TEST_F(CommonShimsSlimTest, LargeTensor) { - std::vector sizes = {100, 200, 300}; - std::vector strides = calculateContiguousStrides(sizes); - slim_c10::Device device(slim_c10::DeviceType::CPU, 0); - - Tensor* tensor = new Tensor(slim::empty_strided( - slim::makeArrayRef(sizes), - slim::makeArrayRef(strides), - slim_c10::ScalarType::Float, - device)); - trackTensor(tensor); - - int64_t* ret_sizes = nullptr; - int64_t* ret_strides = nullptr; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, &ret_sizes), Error::Ok); - EXPECT_EQ(ret_sizes[0], 100); - EXPECT_EQ(ret_sizes[1], 200); - EXPECT_EQ(ret_sizes[2], 300); - - EXPECT_EQ(aoti_torch_get_strides(tensor, &ret_strides), Error::Ok); - EXPECT_EQ(ret_strides[0], 60000); // 200 * 300 - EXPECT_EQ(ret_strides[1], 300); // 300 - EXPECT_EQ(ret_strides[2], 1); -} - -TEST_F(CommonShimsSlimTest, ConsistentPointerReturn) { - Tensor* tensor = createTestTensor({2, 3, 4}, slim_c10::DeviceType::CPU); - - // Multiple calls should return the same pointer (for SlimTensor) - int64_t* sizes_ptr1 = nullptr; - int64_t* sizes_ptr2 = nullptr; - - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); - EXPECT_EQ(sizes_ptr1, sizes_ptr2); - - int64_t* strides_ptr1 = nullptr; - int64_t* strides_ptr2 = nullptr; - - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); - EXPECT_EQ(strides_ptr1, strides_ptr2); -} - -// ============================================================================ -// DType Constants Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, DTypeConstants) { - // Verify dtype constants match expected PyTorch ScalarType values - EXPECT_EQ(aoti_torch_dtype_float32(), 6); // ScalarType::Float - EXPECT_EQ(aoti_torch_dtype_bfloat16(), 15); // ScalarType::BFloat16 - EXPECT_EQ(aoti_torch_dtype_int64(), 4); // ScalarType::Long - EXPECT_EQ(aoti_torch_dtype_int32(), 3); // ScalarType::Int - EXPECT_EQ(aoti_torch_dtype_int16(), 2); // ScalarType::Short - EXPECT_EQ(aoti_torch_dtype_int8(), 1); // ScalarType::Char - EXPECT_EQ(aoti_torch_dtype_bool(), 11); // ScalarType::Bool -} - -// ============================================================================ -// Device Type Constants Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, DeviceTypeConstants) { - EXPECT_EQ(aoti_torch_device_type_cpu(), 0); // DeviceType::CPU - EXPECT_EQ(aoti_torch_device_type_cuda(), 1); // DeviceType::CUDA -} - -// ============================================================================ -// Grad Mode Tests -// ============================================================================ - -TEST_F(CommonShimsSlimTest, GradModeIsEnabled) { - // ExecuTorch doesn't support autograd, so should always return false - EXPECT_EQ(aoti_torch_grad_mode_is_enabled(), false); -} - -TEST_F(CommonShimsSlimTest, GradModeSetEnabled) { - // Setting to false should succeed - EXPECT_EQ(aoti_torch_grad_mode_set_enabled(false), Error::Ok); - - // Setting to true should fail (not supported in ExecuTorch) - EXPECT_EQ(aoti_torch_grad_mode_set_enabled(true), 
Error::NotSupported); -} diff --git a/backends/aoti/tests/utils.h b/backends/aoti/tests/utils.h deleted file mode 100644 index 1f26f7e2d51..00000000000 --- a/backends/aoti/tests/utils.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { -namespace test { - -// Use the same type aliases as in common_shims.h -using executorch::runtime::etensor::Tensor; - -/** - * Creates a test tensor with the specified shape and scalar type - */ -inline std::shared_ptr create_test_tensor( - const std::vector& sizes, - exec_aten::ScalarType dtype = exec_aten::ScalarType::Float) { - // Calculate total number of elements - int64_t total_elements = 1; - for (int64_t size : sizes) { - total_elements *= size; - } - - // Calculate strides (row-major layout) - std::vector strides(sizes.size()); - if (sizes.size() > 0) { - strides[sizes.size() - 1] = 1; - for (int i = sizes.size() - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - } - - // Allocate data buffer - size_t dtype_size = exec_aten::elementSize(dtype); - void* data = malloc(total_elements * dtype_size); - - // Convert sizes and strides to the required type - std::vector sizes_converted( - sizes.begin(), sizes.end()); - std::vector strides_converted( - strides.begin(), strides.end()); - - // Create the tensor with the correct argument types and count - auto tensor = executorch::extension::from_blob( - data, sizes_converted, strides_converted, dtype); - - return tensor; -} - -/** - * Helper to clean up tensor data that was allocated with malloc - */ -inline void free_tensor_data(Tensor* tensor) { - if (tensor && tensor->mutable_data_ptr()) { - free(tensor->mutable_data_ptr()); - } -} - -} // namespace test -} // namespace aoti -} // namespace backends -} // namespace executorch diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c85e07d4b59..1cb8bf78c4c 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -98,6 +98,7 @@ install( ) # CUDA-specific AOTI shim symbols (dynamically linked) +# Note: common_shims.h is header-only (all functions are inline) set(_aoti_cuda_shim_sources runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu @@ -106,6 +107,9 @@ set(_aoti_cuda_shim_sources add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) +# Define CUDA_AVAILABLE to use SlimTensor in common_shims.h +target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1) + # Define export macros for shared library if(MSVC) target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index ad5baa8d83f..0d2e14248df 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -71,34 +71,34 @@ runtime.cxx_library( runtime.cxx_library( name = "runtime_shims", srcs = [ - "guard.cpp", "shims/cuda_guard.cpp", "shims/int4mm.cu", "shims/memory.cpp", "shims/tensor_attribute.cpp", ], headers = [ - "guard.h", "shims/cuda_guard.h", "shims/int4mm.cuh", "shims/int4mm.h", "shims/memory.h", "shims/tensor_attribute.h", - "utils.h", ], # @lint-ignore BUCKLINT: Avoid `link_whole=True` 
(https://fburl.com/avoid-link-whole) link_whole = True, supports_python_dlopen = True, # Constructor needed for backend registration. compiler_flags = ["-Wno-global-constructors"], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], visibility = ["PUBLIC"], deps = [ ":tensor_maker", "//executorch/backends/aoti:common_shims", + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/platform:platform", - "//executorch/backends/cuda/runtime:cuda_platform", ], nvcc_flags = get_nvcc_arch_args() + [ "-_NVCC_HOST_COMPILER_FLAG_", @@ -109,33 +109,12 @@ runtime.cxx_library( ], ) +# Legacy alias for backward compatibility runtime.cxx_library( name = "runtime_shims_slim", - srcs = [ - "shims/memory_slim.cpp", - ], - headers = [ - "shims/memory_slim.h", - ], - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) - link_whole = True, - supports_python_dlopen = True, - visibility = ["@EXECUTORCH_CLIENTS"], - preprocessor_flags = ["-DCUDA_AVAILABLE=1"], - deps = [ - "//executorch/backends/aoti/slim/core:slimtensor", - "//executorch/backends/aoti/slim/factory:empty", - "//executorch/backends/aoti/slim/factory:from_blob", - "//executorch/backends/aoti:common_shims", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - ], - nvcc_flags = get_nvcc_arch_args() + [ - "-_NVCC_HOST_COMPILER_FLAG_", - "gcc", - ], - external_deps = [ - ("cuda", None, "cuda-lazy"), + visibility = ["PUBLIC"], + exported_deps = [ + ":runtime_shims", ], ) @@ -149,10 +128,16 @@ runtime.cxx_library( supports_python_dlopen = True, # Constructor needed for backend registration. compiler_flags = ["-Wno-global-constructors"], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], visibility = ["PUBLIC"], deps = [ - ":runtime_shims", + ":runtime_shims_slim", "//executorch/backends/aoti:aoti_common", + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", + "//executorch/backends/aoti/slim/factory:from_etensor", + "//executorch/extension/tensor:tensor", "//executorch/runtime/backend:interface", "//executorch/runtime/core/exec_aten/util:tensor_util", ], diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index cd1c6b96f02..4f3bdf6321a 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -19,8 +19,19 @@ #include #include #include +#include #include +// Include SlimTensor headers for CUDA backend +#include +#include +#include +#include +#include +#include +#include +#include + // Include our shim layer headers #include #include @@ -52,10 +63,113 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; +// SlimTensor type aliases +using slim::c10::Device; +using slim::c10::DeviceType; +using slim::CPU_DEVICE; +using slim::DEFAULT_CUDA_DEVICE; +using slim::DeviceTraits; +using slim::from_etensor; +using slim::SlimTensor; + namespace { constexpr char kSkipCopyOutputToCpuForMethod[] = "skip_copy_output_to_cpu_for_method"; + +/** + * Copies data from a SlimTensor to an ETensor. + * + * This function converts a SlimTensor back to an ETensor. The ETensor is + * assumed to always reside on CPU, so this handles both CPU→CPU and GPU→CPU + * copies. 
The function will resize the ETensor if needed and copy the data. + * + * @param slim_tensor Pointer to the source SlimTensor (must not be null). + * @param etensor Pointer to the destination ETensor (must not be null). + * @return Error::Ok on success, or an appropriate error code on failure. + */ +inline Error copy_slimtensor_to_etensor( + const SlimTensor* slim_tensor, + Tensor* etensor) { + ET_CHECK_OR_RETURN_ERROR( + slim_tensor != nullptr, + InvalidArgument, + "copy_slimtensor_to_etensor: slim_tensor pointer cannot be nullptr"); + + ET_CHECK_OR_RETURN_ERROR( + etensor != nullptr, + InvalidArgument, + "copy_slimtensor_to_etensor: etensor pointer cannot be nullptr"); + + // Check that storage_offset is 0 (ETensor does not support storage offsets) + ET_CHECK_OR_RETURN_ERROR( + slim_tensor->storage_offset() == 0, + InvalidArgument, + "copy_slimtensor_to_etensor: SlimTensor storage_offset must be 0, got %ld", + static_cast(slim_tensor->storage_offset())); + + // Check that SlimTensor is contiguous + ET_CHECK_OR_RETURN_ERROR( + slim_tensor->is_contiguous(), + InvalidArgument, + "copy_slimtensor_to_etensor: SlimTensor must be contiguous"); + + // Check dtype matches + slim::c10::ScalarType slim_dtype = slim_tensor->dtype(); + executorch::runtime::etensor::ScalarType etensor_dtype = etensor->scalar_type(); + ET_CHECK_OR_RETURN_ERROR( + static_cast(slim_dtype) == static_cast(etensor_dtype), + InvalidArgument, + "copy_slimtensor_to_etensor: dtype mismatch, SlimTensor dtype %d != ETensor dtype %d", + static_cast(slim_dtype), + static_cast(etensor_dtype)); + + // Check dimensions match + ET_CHECK_OR_RETURN_ERROR( + static_cast(slim_tensor->dim()) == etensor->dim(), + InvalidArgument, + "copy_slimtensor_to_etensor: dimension mismatch, SlimTensor dim %zu != ETensor dim %zd", + slim_tensor->dim(), + etensor->dim()); + + // Convert sizes from int64_t to SizesType (int32_t) for resize + const size_t ndim = slim_tensor->dim(); + std::vector new_sizes( + ndim); + auto slim_sizes = slim_tensor->sizes(); + for (size_t i = 0; i < ndim; ++i) { + new_sizes[i] = static_cast< + executorch::runtime::etensor::TensorImpl::SizesType>(slim_sizes[i]); + } + + // Resize ETensor to match SlimTensor sizes + Error resize_err = executorch::ET_RUNTIME_NAMESPACE::resize_tensor( + *etensor, + executorch::runtime::ArrayRef< + executorch::runtime::etensor::TensorImpl::SizesType>( + new_sizes.data(), new_sizes.size())); + ET_CHECK_OK_OR_RETURN_ERROR( + resize_err, "copy_slimtensor_to_etensor: failed to resize ETensor"); + + // Copy data from SlimTensor to ETensor + // SlimTensor may be on GPU or CPU, ETensor is always on CPU + size_t nbytes = slim_tensor->nbytes(); + if (nbytes > 0) { + void* dst_data = etensor->mutable_data_ptr(); + const void* src_data = slim_tensor->data_ptr(); + + if (slim_tensor->is_cpu()) { + // CPU → CPU copy + std::memcpy(dst_data, src_data, nbytes); + } else { + // GPU → CPU copy + DeviceTraits::memcpy( + dst_data, src_data, nbytes, CPU_DEVICE, slim_tensor->device()); + } + } + + return Error::Ok; } +} // anonymous namespace class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { @@ -285,87 +399,76 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) - // NOTE: ExecuTorch tensors are always on CPU/host memory - We need to create GPU copies for CUDA kernel execution - std::vector gpu_inputs( - n_inputs); // GPU copies for kernel execution - std::vector gpu_outputs( - n_outputs); // GPU tensors for kernel output + // NOTE: ExecuTorch tensors may be on CPU or
GPU due to the skip-copy optimization + // We need to create GPU copies for CUDA kernel execution using SlimTensor + std::vector gpu_input_tensors(n_inputs); + std::vector gpu_inputs(n_inputs); + std::vector gpu_output_tensors(n_outputs); + std::vector gpu_outputs(n_outputs); - // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies + // Process input tensors: convert ETensor (CPU) to SlimTensor (GPU) for (size_t i = 0; i < n_inputs; i++) { - // Get tensor dimensions and properties from ExecuTorch CPU tensor - auto cpu_tensor = &(args[i]->toTensor()); - auto sizes = cpu_tensor->sizes(); - auto scalar_type = cpu_tensor->scalar_type(); - - // Create GPU tensor with same shape - std::vector sizes_vec(sizes.begin(), sizes.end()); + auto* cpu_tensor = &(args[i]->toTensor()); + + // Check if input data is already on GPU (skip-copy optimization for inputs) + // This can happen when the caller has pre-staged data on GPU + cudaPointerAttributes attributes{}; + const void* data_ptr = cpu_tensor->const_data_ptr(); + if (data_ptr != nullptr) { + cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr); + if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { + // Data is already on GPU - wrap it directly without copy + auto sizes = cpu_tensor->sizes(); + auto strides = cpu_tensor->strides(); + std::vector sizes_vec(sizes.begin(), sizes.end()); + std::vector strides_vec(strides.begin(), strides.end()); + + gpu_input_tensors[i] = slim::from_blob( + const_cast(data_ptr), + slim::makeArrayRef(sizes_vec), + slim::makeArrayRef(strides_vec), + static_cast(cpu_tensor->scalar_type()), + DEFAULT_CUDA_DEVICE, + 0 // storage_offset + ); + gpu_inputs[i] = &gpu_input_tensors[i]; + continue; + } + } - AOTITensorHandle gpu_input_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_input_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for input %d", - i); - - gpu_inputs[i] = gpu_input_handle; - - // Copy data from CPU to GPU - ET_CHECK_OR_RETURN_ERROR( - aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, - Internal, - "Failed to copy input %d from CPU to GPU", - i); + // Data is on CPU - use from_etensor to copy to GPU + gpu_input_tensors[i] = + from_etensor(*cpu_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + gpu_inputs[i] = &gpu_input_tensors[i]; } - // Process output tensors: create GPU counterparts for ExecuTorch CPU - // tensors + + // Process output tensors: create GPU SlimTensors for kernel output for (size_t i = 0; i < n_outputs; i++) { - // Get output tensor dimensions from ExecuTorch CPU tensor - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); + auto strides = cpu_output_tensor->strides(); auto scalar_type = cpu_output_tensor->scalar_type(); - // Create GPU tensor with same shape for kernel output std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_output_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_output_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for output %d", 
- i); - - gpu_outputs[i] = gpu_output_handle; + std::vector strides_vec(strides.begin(), strides.end()); + + gpu_output_tensors[i] = slim::empty_strided( + slim::makeArrayRef(sizes_vec), + slim::makeArrayRef(strides_vec), + static_cast(scalar_type), + DEFAULT_CUDA_DEVICE); + gpu_outputs[i] = &gpu_output_tensors[i]; } - // Run AOTI container with GPU tensors + + // Run AOTI container with GPU SlimTensors AOTIRuntimeError error = handle->run( handle->container_handle, - gpu_inputs.data(), // Use GPU input tensors + reinterpret_cast(gpu_inputs.data()), n_inputs, - gpu_outputs.data(), // Use GPU output tensors + reinterpret_cast(gpu_outputs.data()), n_outputs, - handle->cuda_stream, // Pass the actual CUDA stream - nullptr); // proxy_executor_handle can remain nullptr + handle->cuda_stream, + nullptr); ET_CHECK_OR_RETURN_ERROR( error == Error::Ok, @@ -376,22 +479,53 @@ class ET_EXPERIMENTAL CudaBackend final const bool copy_outputs = !should_skip_copy_for_method(handle->method_name); if (copy_outputs) { - // Copy GPU output results back to CPU output tensors + // Copy GPU SlimTensor results back to CPU ETensors for (size_t i = 0; i < n_outputs; i++) { - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - // For DYNAMIC_BOUND tensors we try to resize + auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); ET_CHECK_OK_OR_RETURN_ERROR( - resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", - i); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", + copy_slimtensor_to_etensor(gpu_outputs[i], cpu_output_tensor), + "Failed to copy GPU output %zu back to CPU ETensor", i); } } else { - for (size_t i = 0; i < n_outputs; i++) { - args[i + n_inputs]->toTensor() = *gpu_outputs[i]; + // Skip-copy optimization: wrap GPU data as ETensor using from_blob + // The caller is responsible for handling GPU data directly + { + std::lock_guard guard(cached_outputs_mutex_); + auto& cached_outputs = cached_outputs_[handle]; + + // Clear cached outputs for previous round + cached_outputs.clear(); + for (size_t i = 0; i < n_outputs; i++) { + // Move output SlimTensors to cached_outputs for lifetime management + cached_outputs.push_back(std::move(gpu_output_tensors[i])); + + // Create an ETensor wrapper pointing to the GPU data + // The data stays on GPU and the caller handles it + SlimTensor& cached = cached_outputs.back(); + auto slim_sizes = cached.sizes(); + auto slim_strides = cached.strides(); + + std::vector et_sizes(cached.dim()); + std::vector et_strides(cached.dim()); + for (size_t d = 0; d < cached.dim(); d++) { + et_sizes[d] = + static_cast(slim_sizes[d]); + et_strides[d] = + static_cast(slim_strides[d]); + } + + // Use tensor_ptr_maker to create a non-owning ETensor wrapper + // Note: This creates a view into the SlimTensor's GPU memory + auto tensor_ptr = executorch::extension::from_blob( + cached.data_ptr(), + std::move(et_sizes), + std::move(et_strides), + static_cast(cached.dtype())); + + // Assign the wrapped tensor to the output EValue + args[i + n_inputs]->toTensor() = *tensor_ptr; + } } } @@ -424,9 +558,12 @@ class ET_EXPERIMENTAL CudaBackend final // AOTInductorModelContainerDelete(handle->container_handle); // Now close the shared library - auto err = Error::Ok; if (handle->so_handle != nullptr) { - err = close_library(handle->so_handle); + Error err = close_library(handle->so_handle); + ET_CHECK_OR_LOG_ERROR( + err == Error::Ok, + "Failed to close 
shared library for %s", + handle->so_path.c_str()); } // Remove the temporary shared library file @@ -441,12 +578,19 @@ class ET_EXPERIMENTAL CudaBackend final } delete handle; - clear_all_tensors(); } private: mutable std::mutex skip_copy_method_mutex_; std::string skip_copy_method_; + + // Cached output tensors for the skip-copy optimization. + // When skip-copy is enabled, output SlimTensors are cached here to keep + // GPU memory alive while the caller processes the results. + // Maps from AOTIDelegateHandle* to its cached outputs. + mutable std::mutex cached_outputs_mutex_; + mutable std::unordered_map> + cached_outputs_; }; } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h index 3f187000f90..2f0fb8f7546 100644 --- a/backends/cuda/runtime/guard.h +++ b/backends/cuda/runtime/guard.h @@ -19,8 +19,8 @@ namespace executorch::backends::cuda { using executorch::runtime::Error; using executorch::runtime::Result; -// Type alias for device index -using DeviceIndex = int32_t; +// Signed device index type matching DeviceIndex in the SlimTensor library +using DeviceIndex = int8_t; /** * Set the current CUDA stream for the specified device. diff --git a/backends/cuda/runtime/shims/int4mm.cuh b/backends/cuda/runtime/shims/int4mm.cuh index ee12fb51004..8ee3fcb957e 100644 --- a/backends/cuda/runtime/shims/int4mm.cuh +++ b/backends/cuda/runtime/shims/int4mm.cuh @@ -1177,13 +1177,14 @@ Tensor* _weight_int4pack_mm_cuda( ET_CHECK(B_innerKTiles == 2 || B_innerKTiles == 4 || B_innerKTiles == 8); // A is standard row major - ET_CHECK(A.dtype() == executorch::aten::ScalarType::BFloat16); + // SlimTensor::dtype() returns slim::c10::ScalarType, cast to int32_t for comparison + ET_CHECK(static_cast(A.dtype()) == static_cast(SupportedDTypes::BFLOAT16)); // ET only supports contiguous tensors for now // ET_CHECK(A.is_contiguous()); ET_CHECK(A.dim() == 2); // B has B_innerKTiles k-tiles in the innermost dimension - ET_CHECK(B.dtype() == executorch::aten::ScalarType::Int); + ET_CHECK(static_cast(B.dtype()) == static_cast(SupportedDTypes::INT32)); // ET only supports contiguous tensors for now // ET_CHECK(B.is_contiguous()); ET_CHECK(B.dim() == 4); diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 86f6cdd6396..c10cbc3ad7f 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -6,104 +6,26 @@ * LICENSE file in the root directory of this source tree.
*/ -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace executorch::backends::cuda { - -using executorch::aten::SizesType; -using executorch::aten::StridesType; -using executorch::backends::aoti::aoti_torch_dtype_bool; -using executorch::backends::aoti::aoti_torch_get_device_index; -using executorch::backends::aoti::aoti_torch_get_dtype; -using executorch::backends::aoti::aoti_torch_get_sizes; -using executorch::backends::aoti::aoti_torch_get_strides; -using executorch::backends::aoti::convert_sizes_to_vector; -using executorch::backends::aoti::convert_strides_to_vector; -using executorch::backends::aoti::dtype_to_element_size; -using executorch::backends::aoti::dtype_to_scalar_type; -using executorch::backends::aoti::validate_storage_offset; - -// Global storage for tensors and their metadata -std::unordered_set> tensors; - -// Reference counting for memory addresses -// Maps memory address to number of tensors using it -// Special value: NOT_OWN (-1) means tensor never owns the memory -constexpr int32_t NOT_OWN = -1; -std::unordered_map memory_to_n_tensor; - -namespace { - -// Calculate linear offset from strides and indices -int64_t calculate_linear_offset( - const int64_t* indices, - const int64_t* strides, - int64_t ndim) { - int64_t offset = 0; - for (int64_t i = 0; i < ndim; ++i) { - offset += indices[i] * strides[i]; - } - return offset; -} - -// Convert linear index to multi-dimensional indices based on sizes -void linear_to_indices( - int64_t linear_idx, - const int64_t* sizes, - int64_t ndim, - int64_t* indices) { - for (int64_t i = ndim - 1; i >= 0; --i) { - indices[i] = linear_idx % sizes[i]; - linear_idx /= sizes[i]; - } -} +#include +#include +#include +#include -// Generic pointwise copy function that handles arbitrary strides -template -AOTITorchError pointwise_copy_generic( - T* dst_data, - const T* src_data, - const int64_t* dst_sizes, - const int64_t* dst_strides, - const int64_t* src_sizes, - const int64_t* src_strides, - int64_t dst_ndim, - int64_t src_ndim, - int64_t total_elements) { - std::vector dst_indices(dst_ndim); - std::vector src_indices(src_ndim); - - for (int64_t linear_idx = 0; linear_idx < total_elements; ++linear_idx) { - // Convert linear index to multi-dimensional indices for both tensors - linear_to_indices(linear_idx, dst_sizes, dst_ndim, dst_indices.data()); - linear_to_indices(linear_idx, src_sizes, src_ndim, src_indices.data()); - - // Calculate offsets for both source and destination - int64_t src_offset = - calculate_linear_offset(src_indices.data(), src_strides, src_ndim); - int64_t dst_offset = - calculate_linear_offset(dst_indices.data(), dst_strides, dst_ndim); - - // Copy element - dst_data[dst_offset] = src_data[src_offset]; - } +namespace executorch::backends::cuda { - return Error::Ok; -} +namespace c10 = executorch::backends::aoti::slim::c10; +using c10::Device; +using c10::DeviceIndex; +using c10::DeviceType; +using c10::ScalarType; +using executorch::backends::aoti::slim::empty_strided; +using executorch::backends::aoti::slim::from_blob; +using executorch::backends::aoti::slim::IntArrayRef; -} // anonymous namespace +// Use SlimTensor directly to avoid naming conflicts with ETensor +using SlimTensor = executorch::backends::aoti::slim::SlimTensor; extern "C" { @@ -116,109 +38,43 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor, + SlimTensor** ret_new_tensor, 
int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size) { - (void)opaque_metadata; + // Unused parameters (void)layout; + (void)opaque_metadata; (void)opaque_metadata_size; - // Validate input parameters first ET_CHECK_OR_RETURN_ERROR( data != nullptr, InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null"); + "aoti_torch_create_tensor_from_blob_v2: data is null"); ET_CHECK_OR_RETURN_ERROR( ret_new_tensor != nullptr, InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null"); - - // Check that device_index is always 0 - ET_CHECK_OR_RETURN_ERROR( - device_index == 0, - InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // Validate dtype using SupportedDTypes from utils.h - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Storage offset must be 0 since from_blob cannot handle different offsets - ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); - - // Verify that data pointer location matches the requested device_type - cudaPointerAttributes data_attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&data_attributes, data)); - - bool data_is_on_device = data_attributes.type == cudaMemoryTypeDevice; - bool data_is_on_host = data_attributes.type == cudaMemoryTypeHost || - data_attributes.type == cudaMemoryTypeUnregistered; - bool requested_device = - device_type == static_cast(SupportedDevices::CUDA); - bool requested_cpu = - device_type == static_cast(SupportedDevices::CPU); - - // Error if data location doesn't match requested device type - ET_CHECK_OR_RETURN_ERROR( - !(data_is_on_device && requested_cpu), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CUDA " - "but device_type is CPU. Data must be on CPU for CPU tensors.", - data); + "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null"); ET_CHECK_OR_RETURN_ERROR( - !(data_is_on_host && requested_device), + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CPU " - "but device_type is CUDA. Data must be on GPU for CUDA tensors.", - data); - - // Convert sizes to the format expected by ExecutorTorch using SizesType - std::vector sizes = - convert_sizes_to_vector(ndim, sizes_ptr); - - // Convert strides using the common helper function with StridesType - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create ExecutorTorch tensor that wraps the existing memory - // Note: We're NOT copying the data, just wrapping it - // Using CUDA-specific tensor maker that supports incontiguous tensors - auto tensor = make_tensor( - sizes, // tensor dimensions - data, // existing memory (don't copy!) 
- {}, // dim_order (empty, will be auto-generated) - strides, // tensor strides (allows different strides) - dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType - ); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Failed to create tensor from blob"); + "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0"); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - - *ret_new_tensor = tensor.get(); - - // Check if this memory address is already being tracked - auto memory_it = memory_to_n_tensor.find(data); - ET_CHECK_OR_RETURN_ERROR( - memory_it == memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is already being tracked by another tensor", - data); + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); - // Mark this memory as NOT_OWN since tensor created from blob never owns - // memory - memory_to_n_tensor[data] = NOT_OWN; + // Create the SlimTensor using from_blob (non-owning) + *ret_new_tensor = new SlimTensor(from_blob( + data, + sizes, + strides, + static_cast(dtype), + Device( + static_cast(device_type), + static_cast(device_index)), + storage_offset)); return Error::Ok; } @@ -230,697 +86,177 @@ AOTITorchError aoti_torch_empty_strided( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor) { - // Check that device_index is always 0 + SlimTensor** ret_new_tensor) { ET_CHECK_OR_RETURN_ERROR( - device_index == 0, + ret_new_tensor != nullptr, InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // This requires us to reserve CUDA memory and put it into a ETensor - void* ptr; + "aoti_torch_empty_strided: ret_new_tensor is null"); - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - size_t element_size = dtype_to_element_size(dtype); ET_CHECK_OR_RETURN_ERROR( - element_size != 0, + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "Invalid element size for dtype: %d", - dtype); - - // Calculate storage size based on strides, matching PyTorch's behavior - // This is critical when sizes and strides don't match the expected contiguous - // layout Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp - int64_t storage_size = 1; // storage offset (0) + 1 - for (int64_t i = 0; i < ndim; i++) { - if (sizes_ptr[i] == 0) { - storage_size = 0; - break; - } - // For each dimension, add stride[i] * (size[i] - 1) - // This gives us the maximum offset in that dimension - int64_t stride_i = (strides_ptr != nullptr) ? 
strides_ptr[i] : 1; - if (strides_ptr == nullptr) { - // Calculate contiguous stride if not provided - for (int64_t j = i + 1; j < ndim; j++) { - stride_i *= sizes_ptr[j]; - } - } - storage_size += stride_i * (sizes_ptr[i] - 1); - } - int64_t nbytes = storage_size * element_size; - - if (device_type == static_cast(SupportedDevices::CUDA)) { - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaMallocAsync(&ptr, static_cast(nbytes), cudaStreamDefault)); - } else if (device_type == static_cast(SupportedDevices::CPU)) { - // Ensure 16-byte alignment for CPU memory to match CUDA requirements - ptr = aligned_alloc(16, nbytes); - ET_CHECK_OR_RETURN_ERROR( - ptr != nullptr, - MemoryAllocationFailed, - "Failed to allocate aligned CPU memory"); - } else { - ET_CHECK_OR_RETURN_ERROR( - false, - NotImplemented, - "Need to implement empty_strided for non-CUDA non-CPU device type %d", - device_type); - } - - // ETensor sizes - auto sizes = convert_sizes_to_vector(ndim, sizes_ptr); - - // ETensor strides - auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // ETensor creation with dynamic shape support for edge cases - // Using CUDA-specific tensor maker that supports incontiguous tensors - auto tensor = make_tensor( + "aoti_torch_empty_strided: sizes_ptr is null but ndim > 0"); + + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); + + // Create the SlimTensor using empty_strided (owning) + *ret_new_tensor = new SlimTensor(empty_strided( sizes, - ptr, - {}, // dim_order (empty, will be auto-generated) strides, - dtype_to_scalar_type(dtype)); + static_cast(dtype), + Device( + static_cast(device_type), + static_cast(device_index)))); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - *ret_new_tensor = tensor.get(); - - // This tensor owns the memory it allocated, set reference count to 1 - memory_to_n_tensor[ptr] = 1; return Error::Ok; } -void clear_all_tensors() { - // Use aoti_torch_delete_tensor_object to properly delete each tensor - // Note: We need to collect tensor pointers first since deletion modifies the - // set - std::vector tensor_ptrs; - tensor_ptrs.reserve(tensors.size()); - for (const auto& tensor_shared : tensors) { - tensor_ptrs.push_back(tensor_shared.get()); - } - - // Now delete each tensor - this will modify the global tensors set - for (Tensor* tensor_ptr : tensor_ptrs) { - aoti_torch_delete_tensor_object(tensor_ptr); - } - - // tensors set should now be empty, but ensure it's cleared - tensors.clear(); - - // Clear memory tracking map (includes leftover NOT_OWN entries) - memory_to_n_tensor.clear(); - - ET_LOG(Info, "Cleared all tensors and memory tracking"); -} - -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { - // Handle null tensor pointer - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Cannot delete null tensor"); - - // Check if tensor exists in our tracking - bool found_in_tensors = false; - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - found_in_tensors = true; - break; - } - } - - // If tensor not found in our tracking, it's invalid +AOTITorchError aoti_torch_delete_tensor_object(SlimTensor* tensor) { ET_CHECK_OR_RETURN_ERROR( - found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor); - - // Find and delete the tensor - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - // Get the tensor before erasing - auto tensor_ptr = *it; - void* data_ptr = 
tensor_ptr->mutable_data_ptr(); - - // Find the reference count for this memory address - auto memory_it = memory_to_n_tensor.find(data_ptr); - if (memory_it != memory_to_n_tensor.end()) { - int32_t ref_count = memory_it->second; - - if (ref_count == NOT_OWN) { - // Tensor never owned the memory, skip freeing - // Just remove tensor from tracking - tensors.erase(it); - return Error::Ok; - } else if (ref_count == 1) { - // Only current tensor using this memory, free it - // Determine if it's GPU memory - cudaPointerAttributes attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&attributes, data_ptr)); - - if (attributes.type == cudaMemoryTypeDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaFreeAsync(data_ptr, cudaStreamDefault)); - } else { - ET_CHECK_OR_RETURN_ERROR( - attributes.type != cudaMemoryTypeManaged, - Internal, - "Expected host memory but got managed!") - // This is CPU memory - free immediately - aligned_free(data_ptr); - data_ptr = nullptr; - } - - // Remove from memory tracking - memory_to_n_tensor.erase(memory_it); - } else if (ref_count > 1) { - // Other tensors still using this memory, just decrement count - memory_to_n_tensor[data_ptr] = ref_count - 1; - } - } else { - ET_CHECK_OR_RETURN_ERROR( - false, - Internal, - "Internal error: memory not found during deletion"); - } - - // Remove tensor from set (this will call the destructor if it's the last - // reference) - tensors.erase(it); - return Error::Ok; - } - } - - // This should never be reached since we found it above - ET_CHECK_OR_RETURN_ERROR( - false, Internal, "Internal error: tensor not found after validation"); -} - -AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { - (void)non_blocking; - - // Check for null pointers first - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, - InvalidArgument, - "aoti_torch_copy_ failed: self tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, + tensor != nullptr, InvalidArgument, - "aoti_torch_copy_ failed: src tensor is null"); + "aoti_torch_delete_tensor_object: tensor is null"); - // Get dtype information and validate compatibility - int32_t self_dtype, src_dtype; - aoti_torch_get_dtype(self, &self_dtype); - aoti_torch_get_dtype(src, &src_dtype); + // SlimTensor uses SharedPtr for storage, so simply deleting the tensor + // will automatically handle reference counting and free the underlying + // storage when no more references exist. + delete tensor; - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype)); - - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype)); + return Error::Ok; +} - // Check dtype compatibility - both tensors must have the same dtype +AOTITorchError aoti_torch_new_tensor_handle( + SlimTensor* orig_handle, + SlimTensor** new_handle) { ET_CHECK_OR_RETURN_ERROR( - self_dtype == src_dtype, + orig_handle != nullptr, InvalidArgument, - "dtype mismatch. self.dtype=%d, src.dtype=%d. aoti_torch_copy_ requires same dtypes", - self_dtype, - src_dtype); - - // Check total number of elements compatibility (PyTorch copy_ behavior) - int64_t self_numel = self->numel(); - int64_t src_numel = src->numel(); + "aoti_torch_new_tensor_handle: orig_handle is null"); ET_CHECK_OR_RETURN_ERROR( - self_numel == src_numel, + new_handle != nullptr, InvalidArgument, - "numel mismatch. 
self.numel()=%ld, src.numel()=%ld", - self_numel, - src_numel); - - // Get tensor metadata - int64_t* self_strides; - int64_t* src_strides; - aoti_torch_get_strides(self, &self_strides); - aoti_torch_get_strides(src, &src_strides); - - int64_t* self_sizes; - int64_t* src_sizes; - aoti_torch_get_sizes(self, &self_sizes); - aoti_torch_get_sizes(src, &src_sizes); - - // Determine device locations - cudaPointerAttributes srcAttributes{}; - cudaPointerAttributes dstAttributes{}; - - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&srcAttributes, src->data_ptr())); - - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&dstAttributes, self->data_ptr())); - - bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; - bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - - // Check if tensors have the same schema (sizes, strides, dtype) for fast path - bool same_schema = true; - for (int i = 0; i < self->dim(); i++) { - if (self_strides[i] != src_strides[i]) { - same_schema = false; - break; - } - } - - size_t total_bytes = src->nbytes(); - int64_t total_elements = self->numel(); - - if (same_schema) { - // Fast path: Direct memory copy since layouts match exactly - if (srcIsDevice && dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyDeviceToDevice)); - } else if (srcIsDevice && !dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyDeviceToHost)); - } else if (!srcIsDevice && dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - total_bytes, - cudaMemcpyHostToDevice)); - } else { - std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); - } - } else { - // Fallback path: Pointwise copy with stride-aware indexing - // This handles arbitrary tensor layouts and strides - - size_t element_size = dtype_to_element_size(self_dtype); - ET_CHECK_OR_RETURN_ERROR( - element_size != 0, - InvalidArgument, - "Invalid element size for dtype: %d", - self_dtype); - - // Allocate temporary host memory for GPU tensors - float* src_host_data = nullptr; - float* dst_host_data = nullptr; - bool need_free_src = false; - bool need_free_dst = false; - - if (srcIsDevice) { - src_host_data = - static_cast(malloc(total_elements * sizeof(float))); - ET_CHECK_OR_RETURN_ERROR( - src_host_data != nullptr, - MemoryAllocationFailed, - "Failed to allocate memory for src_host_data"); - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost)); - need_free_src = true; - } else { - src_host_data = static_cast(src->data_ptr()); - } - - if (dstIsDevice) { - dst_host_data = - static_cast(malloc(total_elements * sizeof(float))); - if (dst_host_data == nullptr) { - if (need_free_src) { - free(src_host_data); - } - ET_CHECK_OR_RETURN_ERROR( - false, - MemoryAllocationFailed, - "Failed to allocate memory for dst_host_data"); - } - need_free_dst = true; - } else { - dst_host_data = static_cast(self->mutable_data_ptr()); - } - - // Perform pointwise copy with stride calculation - AOTITorchError copy_err = pointwise_copy_generic( - dst_host_data, - src_host_data, - self_sizes, - self_strides, - src_sizes, - src_strides, - self->dim(), - src->dim(), - total_elements); - - if (copy_err != Error::Ok) { - // Clean up temporary buffers before returning - if (need_free_src) { - free(src_host_data); - } - if (need_free_dst) { - 
free(dst_host_data); - } - return copy_err; - } - - // Copy result back to device if needed - if (dstIsDevice) { - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - self->mutable_data_ptr(), - dst_host_data, - total_bytes, - cudaMemcpyHostToDevice)); - } - - // Clean up temporary buffers - if (need_free_src) { - free(src_host_data); - } - if (need_free_dst) { - free(dst_host_data); - } - } + "aoti_torch_new_tensor_handle: new_handle is null"); + + // Create a new SlimTensor that shares the same underlying storage. + // SlimTensor's copy constructor shares the SharedPtr, so both + // tensors will reference the same memory. When the last tensor is deleted, + // the storage will be freed. + *new_handle = new SlimTensor(*orig_handle); return Error::Ok; } AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, + SlimTensor* self, int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, int64_t storage_offset, - Tensor** ret_new_tensor) { - // Validate input parameters first + SlimTensor** ret_new_tensor) { ET_CHECK_OR_RETURN_ERROR( self != nullptr, InvalidArgument, - "aoti_torch__reinterpret_tensor failed: self tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch__reinterpret_tensor failed: sizes_ptr is null"); + "aoti_torch__reinterpret_tensor: self is null"); ET_CHECK_OR_RETURN_ERROR( ret_new_tensor != nullptr, InvalidArgument, - "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null"); - - // Check if storage_offset is not 0 - return error if not - ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); - - // Get the device info from the source tensor to perform device_index - // validation - int32_t device_type = 0; - int32_t device_index = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type)); - - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index)); + "aoti_torch__reinterpret_tensor: ret_new_tensor is null"); - // Ensure device_index is always 0 ET_CHECK_OR_RETURN_ERROR( - device_index == 0, + ndim >= 0, InvalidArgument, - "device_index must be 0, got: %d", - device_index); + "aoti_torch__reinterpret_tensor: ndim must be non-negative, got %lld", + static_cast(ndim)); - // Get the dtype from the source tensor - int32_t dtype = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype)); - - // Validate dtype using SupportedDTypes - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Get the original data pointer from the source tensor - void* data_ptr = self->mutable_data_ptr(); ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); - - // Check if the given memory is in the map, if not return error - auto memory_it = memory_to_n_tensor.find(data_ptr); - ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Convert sizes using utility function from utils.h - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - - // Convert strides using utility function from utils.h - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor view that reinterprets the same memory with different - // shape/strides This creates a view, not a copy - the data pointer is shared - // Using CUDA-specific tensor maker that supports incontiguous tensors - std::shared_ptr tensor = make_tensor( - sizes, // New sizes 
with explicit SizesType - data_ptr, // Reuse the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, // New strides with explicit StridesType - dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting - ); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, + !(sizes_ptr == nullptr && ndim > 0), InvalidArgument, - "Failed to create reinterpreted tensor view"); + "aoti_torch__reinterpret_tensor: sizes_ptr is null but ndim > 0"); - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); - *ret_new_tensor = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // Create a new tensor view using as_strided. This creates a tensor that + // shares the same underlying storage but with different sizes, strides, + // and storage offset. SlimTensor::as_strided() handles this via copy + // constructor which shares the SharedPtr. + *ret_new_tensor = + new SlimTensor(self->as_strided(sizes, strides, storage_offset)); return Error::Ok; } -AOTITorchError aoti_torch_new_tensor_handle( - Tensor* orig_handle, - Tensor** new_handle) { - // Validate input parameters - ET_CHECK_OR_RETURN_ERROR( - orig_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle failed: orig_handle is null"); - - ET_CHECK_OR_RETURN_ERROR( - new_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle failed: new_handle is null"); - - // Get metadata from the original tensor - int64_t* sizes_ptr; - int64_t* strides_ptr; - int32_t dtype; - int32_t device_type; - int32_t device_index; - - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(orig_handle, &sizes_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_strides(orig_handle, &strides_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(orig_handle, &dtype)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_device_type(orig_handle, &device_type)); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_get_device_index(orig_handle, &device_index)); - - int64_t ndim = orig_handle->dim(); - - // Validate dtype - ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); - - // Ensure device_index is always 0 - ET_CHECK_OR_RETURN_ERROR( - device_index == 0, - InvalidArgument, - "device_index must be 0, got: %d", - device_index); - - // Get the original data pointer from the source tensor - void* data_ptr = orig_handle->mutable_data_ptr(); - ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); +AOTITorchError +aoti_torch_copy_(SlimTensor* self, SlimTensor* src, int32_t non_blocking) { + (void)non_blocking; // SlimTensor::copy_() is always synchronous for now - // Check if the given memory is in the map - auto memory_it = memory_to_n_tensor.find(data_ptr); ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Convert sizes and strides to vectors - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor that shares the same memory as the original - // This is similar to PyTorch's Tensor copy constructor - creates a new - // tensor 
object that shares the same underlying storage - std::shared_ptr tensor = make_tensor( - sizes, // Same sizes as original - data_ptr, // Share the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, // Same strides as original - dtype_to_scalar_type(dtype) // Same dtype as original - ); + self != nullptr, InvalidArgument, "aoti_torch_copy_: self is null"); ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, InvalidArgument, "Failed to create new tensor handle"); - - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); + src != nullptr, InvalidArgument, "aoti_torch_copy_: src is null"); - *new_handle = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // SlimTensor::copy_() handles: + // - Same numel validation + // - Same dtype validation + // - CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA copies + // - Contiguous fast path and non-contiguous element-wise copy + self->copy_(*src); return Error::Ok; } -AOTITorchError aoti_torch_item_bool(Tensor* tensor, bool* ret_value) { - // Validate input parameters +AOTITorchError aoti_torch_item_bool(SlimTensor* tensor, bool* ret_value) { ET_CHECK_OR_RETURN_ERROR( tensor != nullptr, InvalidArgument, - "aoti_torch_item_bool failed: tensor is null"); + "aoti_torch_item_bool: tensor is null"); ET_CHECK_OR_RETURN_ERROR( ret_value != nullptr, InvalidArgument, - "aoti_torch_item_bool failed: ret_value is null"); - - // Validate that tensor dtype is bool - int32_t dtype; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(tensor, &dtype)); + "aoti_torch_item_bool: ret_value is null"); ET_CHECK_OR_RETURN_ERROR( - dtype == aoti_torch_dtype_bool(), + tensor->numel() == 1, InvalidArgument, - "aoti_torch_item_bool failed: tensor dtype is not bool (got %d)", - dtype); + "aoti_torch_item_bool: tensor must have exactly 1 element, got %zu", + tensor->numel()); - // Get the data pointer - const void* data_ptr = tensor->const_data_ptr(); ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, + tensor->dtype() == ScalarType::Bool, InvalidArgument, - "aoti_torch_item_bool failed: tensor data pointer is null"); - - // Check if tensor is on CUDA or CPU - cudaPointerAttributes attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&attributes, data_ptr)); - - if (attributes.type == cudaMemoryTypeDevice) { - // CUDA memory case: copy from device to host - bool device_value; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( - &device_value, data_ptr, sizeof(bool), cudaMemcpyDeviceToHost)); - *ret_value = device_value; - } else { - // CPU memory case: direct access - const bool* bool_ptr = static_cast(data_ptr); - *ret_value = *bool_ptr; - } + "aoti_torch_item_bool: tensor dtype must be Bool"); + + // SlimTensor::item() handles both CPU and CUDA tensors. + // For CUDA tensors, it copies the value to CPU automatically. 
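+  // (As documented in memory.h above, this read is synchronizing for CUDA
+  //  tensors — a device-to-host copy — so callers should not expect this
+  //  path to be asynchronous.)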
+ *ret_value = tensor->item(); return Error::Ok; } -AOTITorchError aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) { - // Validate input parameters +AOTITorchError aoti_torch_assign_tensors_out(SlimTensor* src, SlimTensor** ret_dst) { ET_CHECK_OR_RETURN_ERROR( src != nullptr, InvalidArgument, - "aoti_torch_assign_tensors_out failed: src is null"); + "aoti_torch_assign_tensors_out: src is null"); ET_CHECK_OR_RETURN_ERROR( ret_dst != nullptr, InvalidArgument, - "aoti_torch_assign_tensors_out failed: ret_dst is null"); - - // Get the data pointer from the source tensor - void* data_ptr = src->mutable_data_ptr(); - ET_CHECK_OR_RETURN_ERROR( - data_ptr != nullptr, - InvalidArgument, - "Source tensor has null data pointer"); + "aoti_torch_assign_tensors_out: ret_dst is null"); - // Check if the given memory is in the map, if not return error - auto memory_it = memory_to_n_tensor.find(data_ptr); - ET_CHECK_OR_RETURN_ERROR( - memory_it != memory_to_n_tensor.end(), - InvalidArgument, - "Memory address %p is not being tracked by reference counting system", - data_ptr); - - // Get dtype from source tensor - int32_t dtype = 0; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(src, &dtype)); - - // Get sizes and strides from source tensor - int64_t* sizes_ptr; - int64_t* strides_ptr; - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(src, &sizes_ptr)); - ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_strides(src, &strides_ptr)); - - int64_t ndim = src->dim(); - - // Convert to vectors - std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); - std::vector strides = - convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); - - // Create new tensor view that shares the same memory as source tensor - std::shared_ptr tensor = make_tensor( - sizes, - data_ptr, // Share the same memory from source tensor - {}, // dim_order (empty, will be auto-generated) - strides, - dtype_to_scalar_type(dtype)); - - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "Failed to create tensor view in aoti_torch_assign_tensors_out"); - - // Store the tensor so it doesn't get destroyed - tensors.insert(tensor); - - *ret_dst = tensor.get(); - - // Increment the reference count for this memory address only if it is owned - // by tensor - memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN - ? NOT_OWN - : memory_to_n_tensor[data_ptr] + 1; + // Move the source tensor into the destination. After this operation, + // the source tensor will be left in an undefined state (reset). + // This differs from aoti_torch_new_tensor_handle which copies the tensor. 
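+  // (Note: assuming standard C++ move semantics, only the contents of `src`
+  //  are transferred; the heap-allocated `src` object itself is not freed
+  //  here and its owner is still expected to release it via
+  //  aoti_torch_delete_tensor_object.)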
+ *ret_dst = new SlimTensor(std::move(*src)); return Error::Ok; } + } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 34b781a5270..036fa5ec6c6 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -8,15 +8,20 @@ #pragma once -#include -#include -#include #include +#include +#include +#include +#include + namespace executorch::backends::cuda { -using executorch::backends::aoti::AOTITorchError; -using executorch::backends::aoti::Tensor; +using executorch::runtime::Error; +using AOTITorchError = Error; + +// Use SlimTensor directly in shim APIs to avoid naming conflicts with ETensor +using SlimTensor = executorch::backends::aoti::slim::SlimTensor; extern "C" { @@ -28,21 +33,17 @@ extern "C" { * * @param data Pointer to the memory blob to wrap (must not be null) * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes (using SizesType) - * @param strides_ptr Pointer to array of strides for each dimension (using - * StridesType, can be null for contiguous) - * @param storage_offset Storage offset (must be 0 for current implementation) - * @param dtype Data type identifier (supports FLOAT32 and BFLOAT16 from - * SupportedDTypes) - * @param device_type Device type (CPU=0, CUDA=1 from SupportedDevices) - * @param device_index Device index (must be 0 for current implementation) - * @param ret_new_tensor Output parameter for the created tensor (must not be - * null) + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param storage_offset Storage offset in number of elements + * @param dtype Data type identifier (matches PyTorch scalar types) + * @param device_type Device type (CPU=0, CUDA=1) + * @param device_index Device index + * @param ret_new_tensor Output parameter for the created tensor * @param layout Tensor layout identifier (0=strided) * @param opaque_metadata Optional metadata pointer (can be null) * @param opaque_metadata_size Size of opaque metadata in bytes - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, @@ -53,24 +54,23 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor, + SlimTensor** ret_new_tensor, int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size); /** * Creates an uninitialized tensor with specified dimensions, strides, and - * dtyper on either CPU or CUDA device. + * dtype on either CPU or CUDA device. 
* * @param ndim Number of dimensions in the tensor * @param sizes_ptr Pointer to array of dimension sizes * @param strides_ptr Pointer to array of strides for each dimension * @param dtype Data type identifier (matches PyTorch scalar types) * @param device_type Device type (0=CPU, 1=CUDA) - * @param device_index Device index (must be 0 for current implementation) + * @param device_index Device index * @param ret_new_tensor Output parameter for the created tensor - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, @@ -79,129 +79,99 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int32_t dtype, int32_t device_type, int32_t device_index, - Tensor** ret_new_tensor); + SlimTensor** ret_new_tensor); /** - * Deletes a tensor object and frees its associated memory. + * Deletes a tensor object and frees associated resources. * - * @param tensor Pointer to the tensor object to be deleted - * @return AOTITorchError error code (Error::Ok on success, or an error code on - * failure) + * For SlimTensor, the underlying storage uses SharedPtr-based reference + * counting. When the last tensor referencing the storage is deleted, + * the memory is automatically freed. + * + * @param tensor Pointer to the tensor to delete (must not be null) + * @return AOTITorchError error code (Error::Ok on success) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(SlimTensor* tensor); /** - * Creates a tensor view that reinterprets the same underlying memory with - * different shape and strides without copying data. + * Creates a new tensor handle that shares storage with the original tensor. * - * Note that the new tensor will not have the ownership of the underlying - * memory. + * The new handle is a copy of the original tensor's metadata (sizes, strides, + * dtype, device) and shares the same underlying storage via SharedPtr. + * Both tensors will reference the same memory, and the memory will only be + * freed when all references are deleted. * - * @param self Input tensor whose memory will be reinterpreted - * @param ndim Number of dimensions for the new tensor view - * @param sizes_ptr Array of sizes for each dimension - * @param strides_ptr Array of strides for each dimension (or nullptr for - * contiguous) - * @param storage_offset Storage offset (must be 0) - * @param ret_new_tensor Output pointer to store the new tensor view + * @param orig_handle Pointer to the original tensor (must not be null) + * @param new_handle Output parameter for the new tensor handle + * @return AOTITorchError error code (Error::Ok on success) + */ +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(SlimTensor* orig_handle, SlimTensor** new_handle); + +/** + * Creates a reinterpreted view of a tensor with new sizes, strides, and offset. + * + * This is equivalent to torch.as_strided() - it creates a new tensor that + * shares the same underlying storage but with different view parameters. 
* - * @return Error::Ok on success, appropriate error code on failure + * @param self Original tensor to reinterpret (must not be null) + * @param ndim Number of dimensions for the new view + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param storage_offset Storage offset in number of elements + * @param ret_new_tensor Output parameter for the reinterpreted tensor view + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, + SlimTensor* self, int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, int64_t storage_offset, - Tensor** ret_new_tensor); + SlimTensor** ret_new_tensor); /** * Copies data from source tensor to destination tensor. * - * This function implements copy function for tensors living in CUDA AOTI - * backend. It supports copying between tensors with different shapes (as long - * as they have the same total number of elements) and different memory - * layouts/strides. - * - * Note that currently this function does not support copying between tensors - * with different dtypes. - * - * @param self Destination tensor (data will be overwritten) - * @param src Source tensor (data will be copied from this tensor) - * @param non_blocking Whether the copy should be non-blocking (currently - * ignored) - * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers, dtype mismatch, numel - * mismatch - * - Error::MemoryAllocationFailed: failed to allocate temporary memory - * - Error::Internal: CUDA operation failures - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); - -/** - * Creates a new tensor handle from an existing one. - * - * This function creates a new tensor object that shares the same underlying - * memory as the original tensor. Similar to PyTorch's Tensor copy constructor, - * it creates a new handle/reference to the same data without performing a deep - * copy. - * - * The new tensor will: - * - Share the same memory/storage as the original tensor - * - Have the same shape, strides, and dtype as the original - * - Increment the reference count for the underlying memory (if owned) - * - * @param orig_handle Original tensor to create a new handle from (must not be - * null) - * @param new_handle Output pointer to store the new tensor handle (must not be - * null) + * Handles all device combinations (CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA) + * and supports tensors with different strides. The destination tensor must + * already be allocated with sufficient storage. * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or invalid parameters + * @param self Destination tensor (must not be null) + * @param src Source tensor to copy from (must not be null) + * @param non_blocking If true, the copy may be asynchronous (currently ignored) + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); +aoti_torch_copy_(SlimTensor* self, SlimTensor* src, int32_t non_blocking); /** - * Retrieves a boolean value from a 0D boolean tensor. + * Extracts a boolean scalar value from a single-element tensor. * - * This function extracts the scalar boolean value from a tensor that contains - * a single boolean element. 
The tensor can be on either CPU or CUDA device. - * For CUDA tensors, the value is copied from device to host memory. + * The tensor must contain exactly one element and have Bool dtype. + * For CUDA tensors, this will synchronize to copy the value to CPU. * - * @param tensor Pointer to a 0D boolean tensor (must not be null) - * @param ret_value Output pointer to store the boolean value (must not be null) - * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or tensor dtype is not bool + * @param tensor Single-element boolean tensor (must not be null) + * @param ret_value Output parameter for the extracted boolean value + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_item_bool(Tensor* tensor, bool* ret_value); +aoti_torch_item_bool(SlimTensor* tensor, bool* ret_value); /** - * Creates a new tensor that shares the same underlying data as the source - * tensor. - * - * This function creates a new tensor view with the same shape, strides, and - * dtype as the source tensor, sharing the same underlying memory. The new - * tensor handle will be stored in ret_dst. + * Moves a tensor into a new handle and assigns it to the output parameter. * - * @param src The source tensor providing the data and metadata. - * @param ret_dst On output, this will point to the new tensor view. + * Unlike aoti_torch_new_tensor_handle which copies, this function moves the + * source tensor into the destination. After this operation, the source tensor + * is left in an undefined/reset state and should not be used. * - * @return Error::Ok on success, appropriate error code on failure: - * - Error::InvalidArgument: null pointers or memory not tracked + * @param src Source tensor to move from (must not be null, will be reset) + * @param ret_dst Output parameter for the new tensor handle + * @return AOTITorchError error code (Error::Ok on success) */ AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst); - -// Function to clear all tensors from internal storage -AOTI_SHIM_EXPORT void clear_all_tensors(); +aoti_torch_assign_tensors_out(SlimTensor* src, SlimTensor** ret_dst); -// Function to clear memory tracking map (for test cleanup) -AOTI_SHIM_EXPORT void clear_memory_tracking(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory_slim.cpp b/backends/cuda/runtime/shims/memory_slim.cpp deleted file mode 100644 index 58bf43b34b0..00000000000 --- a/backends/cuda/runtime/shims/memory_slim.cpp +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace executorch::backends::cuda { - -namespace c10 = executorch::backends::aoti::slim::c10; -using c10::Device; -using c10::DeviceIndex; -using c10::DeviceType; -using c10::ScalarType; -using executorch::backends::aoti::slim::empty_strided; -using executorch::backends::aoti::slim::from_blob; -using executorch::backends::aoti::slim::IntArrayRef; - -extern "C" { - -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { - // Unused parameters - (void)layout; - (void)opaque_metadata; - (void)opaque_metadata_size; - - ET_CHECK_OR_RETURN_ERROR( - data != nullptr, - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: data is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create the SlimTensor using from_blob (non-owning) - *ret_new_tensor = new Tensor(from_blob( - data, - sizes, - strides, - static_cast(dtype), - Device( - static_cast(device_type), - static_cast(device_index)), - storage_offset)); - - return Error::Ok; -} - -AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor) { - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch_empty_strided: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch_empty_strided: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create the SlimTensor using empty_strided (owning) - *ret_new_tensor = new Tensor(empty_strided( - sizes, - strides, - static_cast(dtype), - Device( - static_cast(device_type), - static_cast(device_index)))); - - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "aoti_torch_delete_tensor_object: tensor is null"); - - // SlimTensor uses SharedPtr for storage, so simply deleting the tensor - // will automatically handle reference counting and free the underlying - // storage when no more references exist. - delete tensor; - - return Error::Ok; -} - -AOTITorchError aoti_torch_new_tensor_handle( - Tensor* orig_handle, - Tensor** new_handle) { - ET_CHECK_OR_RETURN_ERROR( - orig_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle: orig_handle is null"); - - ET_CHECK_OR_RETURN_ERROR( - new_handle != nullptr, - InvalidArgument, - "aoti_torch_new_tensor_handle: new_handle is null"); - - // Create a new SlimTensor that shares the same underlying storage. - // SlimTensor's copy constructor shares the SharedPtr, so both - // tensors will reference the same memory. When the last tensor is deleted, - // the storage will be freed. 
- *new_handle = new Tensor(*orig_handle); - - return Error::Ok; -} - -AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - Tensor** ret_new_tensor) { - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, - InvalidArgument, - "aoti_torch__reinterpret_tensor: self is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch__reinterpret_tensor: ret_new_tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - ndim >= 0, - InvalidArgument, - "aoti_torch__reinterpret_tensor: ndim must be non-negative, got %lld", - static_cast(ndim)); - - ET_CHECK_OR_RETURN_ERROR( - !(sizes_ptr == nullptr && ndim > 0), - InvalidArgument, - "aoti_torch__reinterpret_tensor: sizes_ptr is null but ndim > 0"); - - IntArrayRef sizes(sizes_ptr, static_cast(ndim)); - IntArrayRef strides(strides_ptr, static_cast(ndim)); - - // Create a new tensor view using as_strided. This creates a tensor that - // shares the same underlying storage but with different sizes, strides, - // and storage offset. SlimTensor::as_strided() handles this via copy - // constructor which shares the SharedPtr. - *ret_new_tensor = - new Tensor(self->as_strided(sizes, strides, storage_offset)); - - return Error::Ok; -} - -AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { - (void)non_blocking; // SlimTensor::copy_() is always synchronous for now - - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, InvalidArgument, "aoti_torch_copy_: self is null"); - - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, InvalidArgument, "aoti_torch_copy_: src is null"); - - // SlimTensor::copy_() handles: - // - Same numel validation - // - Same dtype validation - // - CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA copies - // - Contiguous fast path and non-contiguous element-wise copy - self->copy_(*src); - - return Error::Ok; -} - -AOTITorchError aoti_torch_item_bool(Tensor* tensor, bool* ret_value) { - ET_CHECK_OR_RETURN_ERROR( - tensor != nullptr, - InvalidArgument, - "aoti_torch_item_bool: tensor is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_value != nullptr, - InvalidArgument, - "aoti_torch_item_bool: ret_value is null"); - - ET_CHECK_OR_RETURN_ERROR( - tensor->numel() == 1, - InvalidArgument, - "aoti_torch_item_bool: tensor must have exactly 1 element, got %zu", - tensor->numel()); - - ET_CHECK_OR_RETURN_ERROR( - tensor->dtype() == ScalarType::Bool, - InvalidArgument, - "aoti_torch_item_bool: tensor dtype must be Bool"); - - // SlimTensor::item() handles both CPU and CUDA tensors. - // For CUDA tensors, it copies the value to CPU automatically. - *ret_value = tensor->item(); - - return Error::Ok; -} - -AOTITorchError aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) { - ET_CHECK_OR_RETURN_ERROR( - src != nullptr, - InvalidArgument, - "aoti_torch_assign_tensors_out: src is null"); - - ET_CHECK_OR_RETURN_ERROR( - ret_dst != nullptr, - InvalidArgument, - "aoti_torch_assign_tensors_out: ret_dst is null"); - - // Move the source tensor into the destination. After this operation, - // the source tensor will be left in an undefined state (reset). - // This differs from aoti_torch_new_tensor_handle which copies the tensor. 
- *ret_dst = new Tensor(std::move(*src)); - - return Error::Ok; -} - -} // extern "C" - -} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory_slim.h b/backends/cuda/runtime/shims/memory_slim.h deleted file mode 100644 index 5a0845f243c..00000000000 --- a/backends/cuda/runtime/shims/memory_slim.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -namespace executorch::backends::cuda { - -using executorch::runtime::Error; -using AOTITorchError = Error; -using Tensor = executorch::backends::aoti::slim::SlimTensor; - -extern "C" { - -/** - * Creates a tensor object from an existing memory blob without copying the - * data. The tensor will wrap the provided memory and will not take ownership of - * it. When the tensor is deleted, the original memory will remain valid and - * must be freed by the caller. - * - * @param data Pointer to the memory blob to wrap (must not be null) - * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param storage_offset Storage offset in number of elements - * @param dtype Data type identifier (matches PyTorch scalar types) - * @param device_type Device type (CPU=0, CUDA=1) - * @param device_index Device index - * @param ret_new_tensor Output parameter for the created tensor - * @param layout Tensor layout identifier (0=strided) - * @param opaque_metadata Optional metadata pointer (can be null) - * @param opaque_metadata_size Size of opaque metadata in bytes - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -/** - * Creates an uninitialized tensor with specified dimensions, strides, and - * dtype on either CPU or CUDA device. - * - * @param ndim Number of dimensions in the tensor - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param dtype Data type identifier (matches PyTorch scalar types) - * @param device_type Device type (0=CPU, 1=CUDA) - * @param device_index Device index - * @param ret_new_tensor Output parameter for the created tensor - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - Tensor** ret_new_tensor); - -/** - * Deletes a tensor object and frees associated resources. - * - * For SlimTensor, the underlying storage uses SharedPtr-based reference - * counting. When the last tensor referencing the storage is deleted, - * the memory is automatically freed. 
- * - * @param tensor Pointer to the tensor to delete (must not be null) - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); - -/** - * Creates a new tensor handle that shares storage with the original tensor. - * - * The new handle is a copy of the original tensor's metadata (sizes, strides, - * dtype, device) and shares the same underlying storage via SharedPtr. - * Both tensors will reference the same memory, and the memory will only be - * freed when all references are deleted. - * - * @param orig_handle Pointer to the original tensor (must not be null) - * @param new_handle Output parameter for the new tensor handle - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); - -/** - * Creates a reinterpreted view of a tensor with new sizes, strides, and offset. - * - * This is equivalent to torch.as_strided() - it creates a new tensor that - * shares the same underlying storage but with different view parameters. - * - * @param self Original tensor to reinterpret (must not be null) - * @param ndim Number of dimensions for the new view - * @param sizes_ptr Pointer to array of dimension sizes - * @param strides_ptr Pointer to array of strides for each dimension - * @param storage_offset Storage offset in number of elements - * @param ret_new_tensor Output parameter for the reinterpreted tensor view - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - Tensor* self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - Tensor** ret_new_tensor); - -/** - * Copies data from source tensor to destination tensor. - * - * Handles all device combinations (CPU-CPU, CPU-CUDA, CUDA-CPU, CUDA-CUDA) - * and supports tensors with different strides. The destination tensor must - * already be allocated with sufficient storage. - * - * @param self Destination tensor (must not be null) - * @param src Source tensor to copy from (must not be null) - * @param non_blocking If true, the copy may be asynchronous (currently ignored) - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); - -/** - * Extracts a boolean scalar value from a single-element tensor. - * - * The tensor must contain exactly one element and have Bool dtype. - * For CUDA tensors, this will synchronize to copy the value to CPU. - * - * @param tensor Single-element boolean tensor (must not be null) - * @param ret_value Output parameter for the extracted boolean value - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_item_bool(Tensor* tensor, bool* ret_value); - -/** - * Moves a tensor into a new handle and assigns it to the output parameter. - * - * Unlike aoti_torch_new_tensor_handle which copies, this function moves the - * source tensor into the destination. After this operation, the source tensor - * is left in an undefined/reset state and should not be used. 
- * - * @param src Source tensor to move from (must not be null, will be reset) - * @param ret_dst Output parameter for the new tensor handle - * @return AOTITorchError error code (Error::Ok on success) - */ -AOTI_SHIM_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst); - -} // extern "C" - -} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt index 204c08688c4..291e3052bbd 100644 --- a/backends/cuda/runtime/shims/tests/CMakeLists.txt +++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt @@ -35,7 +35,7 @@ endif() # Find installed ExecuTorch find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX}) -# List of test files +# List of SlimTensor-based test files (now the primary tests) set(CUDA_SHIM_TESTS test_aoti_torch_create_tensor_from_blob_v2 test_aoti_torch_empty_strided @@ -49,6 +49,7 @@ set(CUDA_SHIM_TESTS enable_testing() +# Build SlimTensor-based tests foreach(test_name ${CUDA_SHIM_TESTS}) add_executable(${test_name} ${test_name}.cpp) @@ -57,16 +58,15 @@ foreach(test_name ${CUDA_SHIM_TESTS}) ${CUDAToolkit_INCLUDE_DIRS} ) + target_compile_definitions(${test_name} PRIVATE CUDA_AVAILABLE=1) + target_link_libraries( ${test_name} PRIVATE GTest::gtest GTest::gtest_main aoti_cuda_shims - aoti_cuda_backend - cuda_tensor_maker - cuda_platform + slimtensor executorch_core - extension_tensor CUDA::cudart ) diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index a6b18eba4c8..04f7aa2f963 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -3,35 +3,12 @@ load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") def cuda_shim_cpp_unittest(name): + """Unittest for SlimTensor-based shim functions.""" cpp_unittest( name = "test_" + name, srcs = [ "test_" + name + ".cpp", ], - deps = [ - "//executorch/backends/aoti:common_shims", - "//executorch/backends/cuda/runtime:runtime_shims", - "//executorch/extension/tensor:tensor", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - "//executorch/runtime/core/exec_aten:lib", - ], - external_deps = [ - ("cuda", None, "cuda-lazy"), - ], - keep_gpu_sections = True, - remote_execution = re_test_utils.remote_execution( - platform = "gpu-remote-execution", - ), - ) - -def cuda_shim_slim_cpp_unittest(name): - """Unittest for SlimTensor-based shim functions.""" - cpp_unittest( - name = "test_" + name + "_slim", - srcs = [ - "test_" + name + "_slim.cpp", - ], deps = [ "//executorch/backends/cuda/runtime:runtime_shims_slim", "//executorch/backends/aoti:common_shims", @@ -58,24 +35,12 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. 
""" - # Original ETensor-based shim tests, will be removed after migration + # SlimTensor-based shim tests (now the primary tests) cuda_shim_cpp_unittest("aoti_torch_empty_strided") - cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") + cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") + cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch__reinterpret_tensor") cuda_shim_cpp_unittest("aoti_torch_copy_") - cuda_shim_cpp_unittest("aoti_torch_cuda_guard") - cuda_shim_cpp_unittest("aoti_torch_cuda__weight_int4pack_mm") - cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") - - # SlimTensor-based shim tests - cuda_shim_slim_cpp_unittest("aoti_torch_empty_strided") - cuda_shim_slim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") - cuda_shim_slim_cpp_unittest("aoti_torch_delete_tensor_object") - cuda_shim_slim_cpp_unittest("aoti_torch_new_tensor_handle") - cuda_shim_slim_cpp_unittest("aoti_torch__reinterpret_tensor") - cuda_shim_slim_cpp_unittest("aoti_torch_copy_") - cuda_shim_slim_cpp_unittest("aoti_torch_item_bool") - cuda_shim_slim_cpp_unittest("aoti_torch_assign_tensors_out") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp index d3044810b15..d2ad645136e 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -7,806 +7,686 @@ */ #include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; -using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; - -// Test fixture for aoti_torch__reinterpret_tensor tests -class AOTITorchReinterpretTensorTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +#include +#include +#include +#include +#include - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +using namespace executorch::backends::cuda; +using executorch::runtime::Error; - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Clear any remaining tensors from previous tests - clear_all_tensors(); - } +namespace { - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear the global tensor storage using the provided function - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; } - - // Helper to calculate number of elements from sizes - int64_t calculate_numel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; - } - return numel; + strides[sizes.size() - 1] = 1; + for (int64_t i = 
static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - // Helper to calculate contiguous strides from sizes - std::vector calculate_contiguous_strides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } +} // namespace - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; +class AOTITorchReinterpretTensorSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a source tensor using empty_strided (which allocates new - // memory) - Tensor* create_source_tensor( + Tensor* createTestTensor( const std::vector& sizes, - int32_t dtype = 6, // float32 - int32_t device_type = 1, // CUDA + const std::vector& strides = {}, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - std::vector strides = calculate_contiguous_strides(sizes); + Tensor* tensor = nullptr; + + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } - Tensor* tensor; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides.data(), + effective_strides.data(), dtype, device_type, device_index, &tensor); - if (error != Error::Ok) { - return nullptr; - } - - return tensor; + return (error == Error::Ok) ? tensor : nullptr; } - - private: - std::vector cuda_memory_buffers_; - std::vector cpu_memory_buffers_; }; -// Test basic functionality: reinterpret tensor with different shapes -TEST_F(AOTITorchReinterpretTensorTest, BasicReinterpretation) { - // Create a source tensor with shape [12] (1D with 12 elements) - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - // Store the original data pointer - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CPU) { + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [3, 4] (2D with same number of elements) - std::vector new_sizes = {3, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {6, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 0; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + storage_offset, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor has the new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 3); - EXPECT_EQ(reinterpreted_tensor->size(1), 4); - - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* 
reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; - - // Write data through the original tensor and verify it's visible through the - // reinterpreted tensor - std::vector test_data = { - 1.0f, - 2.0f, - 3.0f, - 4.0f, - 5.0f, - 6.0f, - 7.0f, - 8.0f, - 9.0f, - 10.0f, - 11.0f, - 12.0f}; - cudaError_t cuda_err = cudaMemcpy( - original_data_ptr, - test_data.data(), - test_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Read back through the reinterpreted tensor - std::vector readback_data(12); - cuda_err = cudaMemcpy( - readback_data.data(), - reinterpreted_data_ptr, - readback_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Verify the data matches - for (size_t i = 0; i < test_data.size(); i++) { - EXPECT_EQ(readback_data[i], test_data[i]) - << "Data should be the same through both tensors at index " << i; - } -} - -// Test reinterpreting with different strides -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretWithCustomStrides) { - // Create a source tensor with shape [2, 6] (contiguous) - std::vector source_sizes = {2, 6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); + ASSERT_NE(view_tensor, nullptr); - // Reinterpret as [3, 4] with custom strides (still valid for the same memory) - std::vector new_sizes = {3, 4}; - std::vector new_strides = {4, 1}; // Row-major strides for [3, 4] + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 6); + EXPECT_EQ(view_tensor->size(1), 4); + EXPECT_EQ(view_tensor->stride(0), 4); + EXPECT_EQ(view_tensor->stride(1), 1); - Tensor* reinterpreted_tensor; - AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - // Check shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 3); - EXPECT_EQ(reinterpreted_tensor->size(1), 4); +TEST_F(AOTITorchReinterpretTensorSlimTest, NullSelf) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + nullptr, sizes.size(), sizes.data(), strides.data(), 0, &view_tensor); - // Verify strides were set correctly - int64_t* tensor_strides; - error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 4); - EXPECT_EQ(tensor_strides[1], 1); + EXPECT_EQ(error, Error::InvalidArgument); } -// Test error cases: null input tensor -TEST_F(AOTITorchReinterpretTensorTest, NullInputTensor) { - std::vector new_sizes = {2, 3}; - std::vector 
new_strides = calculate_contiguous_strides(new_sizes); +TEST_F(AOTITorchReinterpretTensorSlimTest, NullReturnPointer) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {6}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; AOTITorchError error = aoti_torch__reinterpret_tensor( - nullptr, // null input tensor + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + nullptr); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test error cases: null sizes pointer -TEST_F(AOTITorchReinterpretTensorTest, NullSizesPointer) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +TEST_F(AOTITorchReinterpretTensorSlimTest, NegativeNdim) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - std::vector new_strides = {2, 1}; + std::vector new_sizes = {6}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - 2, // ndim > 0 - nullptr, // null sizes pointer - new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + orig_tensor, -1, new_sizes.data(), new_strides.data(), 0, &view_tensor); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test error cases: null return tensor pointer -TEST_F(AOTITorchReinterpretTensorTest, NullReturnTensorPointer) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +// ============================================================================ +// Storage Offset Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CPU) { + std::vector sizes = {4, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {2, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 4; // Skip first row + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - nullptr); // null return tensor pointer + storage_offset, + &view_tensor); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 2); + EXPECT_EQ(view_tensor->size(1), 4); + + char* orig_ptr = static_cast(orig_tensor->data_ptr()); + char* view_ptr = static_cast(view_tensor->data_ptr()); + EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test error cases: non-zero storage offset (should fail) -TEST_F(AOTITorchReinterpretTensorTest, NonZeroStorageOffset) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +// ============================================================================ +// Memory Sharing Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 1, // non-zero storage_offset (should fail) - &reinterpreted_tensor); + 0, + &view_tensor); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reinterpreting CPU tensor -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretCPUTensor) { - // Create a CPU tensor with shape [8] - std::vector source_sizes = {8}; - Tensor* source_tensor = create_source_tensor( - source_sizes, - 6, // float32 - 0, // CPU device +TEST_F(AOTITorchReinterpretTensorSlimTest, MultipleViews_CPU) { + std::vector sizes = {24}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(source_tensor, nullptr); + ASSERT_NE(orig_tensor, nullptr); - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); + void* orig_ptr = orig_tensor->data_ptr(); - // Reinterpret as [2, 4] - std::vector new_sizes = {2, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector sizes1 = {2, 12}; + std::vector strides1 = {12, 1}; + + std::vector sizes2 = {4, 6}; + std::vector strides2 = {6, 1}; + + std::vector sizes3 = {2, 3, 4}; + std::vector strides3 = {12, 4, 1}; + + Tensor* view1 = nullptr; + Tensor* view2 = nullptr; + Tensor* view3 = nullptr; + + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes1.size(), + sizes1.data(), + strides1.data(), + 0, + &view1), + Error::Ok); + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes2.size(), + sizes2.data(), + strides2.data(), + 0, + &view2), + Error::Ok); + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes3.size(), + sizes3.data(), + strides3.data(), + 0, + &view3), + Error::Ok); + + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); + EXPECT_EQ(view3->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); + EXPECT_EQ(view3->data_ptr(), orig_ptr); + + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view3), Error::Ok); +} + +// ============================================================================ +// Dimension Change Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, ExpandDimensions_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_EQ(orig_tensor->dim(), 1); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted CPU tensor should use the same memory as the source tensor"; - - // Test direct memory access for CPU tensors - float* original_float_ptr = reinterpret_cast(original_data_ptr); - float* reinterpreted_float_ptr = - reinterpret_cast(reinterpreted_data_ptr); - - // Write through original and read through reinterpreted - original_float_ptr[0] = 42.0f; - EXPECT_EQ(reinterpreted_float_ptr[0], 42.0f) - << "Changes through original tensor should be visible through reinterpreted tensor"; -} + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 2); -// Test that deleting source tensor doesn't affect reinterpreted tensor (they -// share memory) -TEST_F(AOTITorchReinterpretTensorTest, DeletionBehavior) { - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - void* shared_data_ptr = source_tensor->mutable_data_ptr(); +TEST_F(AOTITorchReinterpretTensorSlimTest, CollapseDimensions_CPU) { + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_EQ(orig_tensor->dim(), 3); - // Reinterpret as [2, 3] - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {24}; + std::vector new_strides = {1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 1); + EXPECT_EQ(view_tensor->numel(), 24); - // Verify they share the same memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), shared_data_ptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + 
EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - // Delete the source tensor (which owns the memory) - error = aoti_torch_delete_tensor_object(source_tensor); - EXPECT_EQ(error, Error::Ok); +TEST_F(AOTITorchReinterpretTensorSlimTest, ScalarTensorView_CPU) { + std::vector sizes = {1}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {}; + std::vector new_strides = {}; - // The reinterpreted tensor should still be valid but the memory might be - // freed Since the source tensor owned the memory, the reinterpreted tensor - // becomes invalid This is expected behavior - the user needs to manage the - // lifecycle properly + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + orig_tensor, 0, new_sizes.data(), new_strides.data(), 0, &view_tensor); - // Clean up the reinterpreted tensor - error = aoti_torch_delete_tensor_object(reinterpreted_tensor); EXPECT_EQ(error, Error::Ok); -} + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->dim(), 0); + EXPECT_EQ(view_tensor->numel(), 1); -// Test scalar tensor reinterpretation -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretScalarTensor) { - // Create a scalar tensor (0D) - std::vector source_sizes = {}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); +} - void* original_data_ptr = source_tensor->mutable_data_ptr(); +// ============================================================================ +// Stride Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, TransposeViaStrides_CPU) { + std::vector sizes = {3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Try to reinterpret scalar as [1] (1D with 1 element) - std::vector new_sizes = {1}; - std::vector new_strides = {1}; + std::vector new_sizes = {4, 3}; + std::vector new_strides = {1, 4}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->size(0), 4); + EXPECT_EQ(view_tensor->size(1), 3); + EXPECT_EQ(view_tensor->stride(0), 1); + EXPECT_EQ(view_tensor->stride(1), 4); - // Check new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 1); - EXPECT_EQ(reinterpreted_tensor->size(0), 1); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reinterpreting tensor with zero-sized dimension -// TODO: This test is disabled because zero-sized tensors have complex stride -// validation requirements that need further investigation -TEST_F(AOTITorchReinterpretTensorTest, DISABLED_ReinterpretZeroSizedTensor) { - // 
Create a tensor with shape [0, 5] (zero elements) - std::vector source_sizes = {0, 5}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, Int64Tensor_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [5, 0] (still zero elements) - std::vector new_sizes = {5, 0}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->itemsize(), 8); - // Check new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 5); - EXPECT_EQ(reinterpreted_tensor->size(1), 0); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test with nullptr strides (should use contiguous strides) -TEST_F(AOTITorchReinterpretTensorTest, NullStridesPointer) { - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); +TEST_F(AOTITorchReinterpretTensorSlimTest, BFloat16Tensor_CPU) { + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Reinterpret as [3, 4] with null strides (should calculate contiguous - // strides) - std::vector new_sizes = {3, 4}; + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - Tensor* reinterpreted_tensor; + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), - nullptr, // null strides - should calculate contiguous strides + new_strides.data(), 0, - &reinterpreted_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor uses the SAME memory - EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->itemsize(), 2); - // Check that contiguous strides were calculated correctly - int64_t* tensor_strides; - error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 4); // stride for dimension 0 should be 4 - EXPECT_EQ(tensor_strides[1], 1); // stride for dimension 1 should be 1 + 
EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test bf16 tensor reinterpretation -TEST_F(AOTITorchReinterpretTensorTest, ReinterpretBF16Tensor) { - // Create a bf16 source tensor with shape [6] - std::vector source_sizes = {6}; - Tensor* source_tensor = create_source_tensor( - source_sizes, - static_cast( - SupportedDTypes::BFLOAT16), // bf16 dtype from SupportedDTypes - static_cast( - SupportedDevices::CUDA), // CUDA device from SupportedDevices - 0); // device_index must be 0 - ASSERT_NE(source_tensor, nullptr); - - void* original_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(original_data_ptr, nullptr); - - // Verify the tensor is actually bf16 - int32_t actual_dtype = 0; - AOTITorchError dtype_check_error = - aoti_torch_get_dtype(source_tensor, &actual_dtype); - EXPECT_EQ(dtype_check_error, Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Source tensor should have bfloat16 dtype"; - - // Reinterpret as [2, 3] (same number of elements) - std::vector new_sizes = {2, 3}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - Tensor* reinterpreted_tensor; + std::vector sizes = {2, 3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + EXPECT_TRUE(orig_tensor->is_cuda()); + + std::vector new_sizes = {6, 4}; + std::vector new_strides = {4, 1}; + + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + 0, + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(reinterpreted_tensor, nullptr); - - // Check that the reinterpreted tensor has the new shape - EXPECT_EQ(reinterpreted_tensor->dim(), 2); - EXPECT_EQ(reinterpreted_tensor->size(0), 2); - EXPECT_EQ(reinterpreted_tensor->size(1), 3); - - // Verify the dtype is preserved as bf16 - int32_t reinterpreted_dtype = 0; - dtype_check_error = - aoti_torch_get_dtype(reinterpreted_tensor, &reinterpreted_dtype); - EXPECT_EQ(dtype_check_error, Error::Ok); - EXPECT_EQ( - reinterpreted_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Reinterpreted tensor should preserve bfloat16 dtype"; - - // CRITICAL: Check that the reinterpreted tensor uses the SAME memory - void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); - EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) - << "Reinterpreted tensor should use the same memory as the source tensor"; - - // Test memory sharing by writing data through the original tensor - // and verifying it's visible through the reinterpreted tensor - // Note: bf16 has 2 bytes per element - std::vector test_data_bf16 = { - 0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0}; // bf16 values - cudaError_t cuda_err = cudaMemcpy( - original_data_ptr, - test_data_bf16.data(), - test_data_bf16.size() * sizeof(uint16_t), - cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Read back through the reinterpreted tensor - std::vector 
readback_data_bf16(6); - cuda_err = cudaMemcpy( - readback_data_bf16.data(), - reinterpreted_data_ptr, - readback_data_bf16.size() * sizeof(uint16_t), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - - // Verify the data matches - for (size_t i = 0; i < test_data_bf16.size(); i++) { - EXPECT_EQ(readback_data_bf16[i], test_data_bf16[i]) - << "BF16 data should be the same through both tensors at index " << i; - } + ASSERT_NE(view_tensor, nullptr); + EXPECT_TRUE(view_tensor->is_cuda()); + + EXPECT_EQ(view_tensor->dim(), 2); + EXPECT_EQ(view_tensor->size(0), 6); + EXPECT_EQ(view_tensor->size(1), 4); + + EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior - memory not in map should fail -TEST_F(AOTITorchReinterpretTensorTest, MemoryNotInMapShouldFail) { - // Create a tensor directly without using our allocation functions - // This should NOT be in the reference counting map - void* external_memory; - ASSERT_EQ( - cudaMallocManaged(&external_memory, 12 * sizeof(float)), cudaSuccess); - - // Create a tensor by manually wrapping this memory without going through our - // APIs - std::vector sizes = {12}; - std::vector strides = calculate_contiguous_strides(sizes); - - // Create the tensor directly using ExecutorTorch extension - auto tensor_shared = executorch::extension::from_blob( - external_memory, - convert_sizes_to_vector(sizes.size(), sizes.data()), - convert_strides_to_vector(sizes.size(), sizes.data(), strides.data()), - executorch::runtime::etensor::ScalarType::Float); - - ASSERT_TRUE(tensor_shared); - Tensor* external_tensor = tensor_shared.get(); - - // Try to reinterpret this tensor - should fail because memory is not in map - std::vector new_sizes = {3, 4}; - std::vector new_strides = calculate_contiguous_strides(new_sizes); - - Tensor* reinterpreted_tensor; +TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {4, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + std::vector new_sizes = {2, 4}; + std::vector new_strides = {4, 1}; + int64_t storage_offset = 8; + + Tensor* view_tensor = nullptr; AOTITorchError error = aoti_torch__reinterpret_tensor( - external_tensor, + orig_tensor, new_sizes.size(), new_sizes.data(), new_strides.data(), - 0, // storage_offset - &reinterpreted_tensor); + storage_offset, + &view_tensor); - // Should fail because memory is not being tracked by reference counting - // system - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + EXPECT_TRUE(view_tensor->is_cuda()); + + char* orig_ptr = static_cast(orig_tensor->data_ptr()); + char* view_ptr = static_cast(view_tensor->data_ptr()); + EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - // Clean up the external memory - ASSERT_EQ(cudaFree(external_memory), cudaSuccess); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior - creating view increments reference count -TEST_F(AOTITorchReinterpretTensorTest, ViewCreationIncrementsReferenceCount) { - // Create a source 
tensor that owns memory (reference count = 1) - std::vector source_sizes = {12}; - Tensor* source_tensor = create_source_tensor(source_sizes); - ASSERT_NE(source_tensor, nullptr); +TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - void* shared_data_ptr = source_tensor->mutable_data_ptr(); - ASSERT_NE(shared_data_ptr, nullptr); + std::vector sizes = {6}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); - // Create first view - should increment reference count to 2 - std::vector view1_sizes = {3, 4}; - std::vector view1_strides = - calculate_contiguous_strides(view1_sizes); + void* orig_ptr = orig_tensor->data_ptr(); - Tensor* view1_tensor; - AOTITorchError error = aoti_torch__reinterpret_tensor( - source_tensor, - view1_sizes.size(), - view1_sizes.data(), - view1_strides.data(), - 0, - &view1_tensor); + std::vector new_sizes = {2, 3}; + std::vector new_strides = {3, 1}; - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view1_tensor, nullptr); - EXPECT_EQ(view1_tensor->mutable_data_ptr(), shared_data_ptr); - - // Create second view - should increment reference count to 3 - std::vector view2_sizes = {2, 6}; - std::vector view2_strides = - calculate_contiguous_strides(view2_sizes); - - Tensor* view2_tensor; - error = aoti_torch__reinterpret_tensor( - source_tensor, - view2_sizes.size(), - view2_sizes.data(), - view2_strides.data(), + Tensor* view_tensor = nullptr; + AOTITorchError error = aoti_torch__reinterpret_tensor( + orig_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), 0, - &view2_tensor); + &view_tensor); EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view2_tensor, nullptr); - EXPECT_EQ(view2_tensor->mutable_data_ptr(), shared_data_ptr); + ASSERT_NE(view_tensor, nullptr); - // Now delete the source tensor - memory should NOT be freed (reference count - // = 2) - error = aoti_torch_delete_tensor_object(source_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - // Both views should still be valid - test by accessing memory - float test_value = 42.0f; - cudaError_t cuda_err = cudaMemcpy( - shared_data_ptr, &test_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view1_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete first view - memory should still NOT be freed (reference count = 1) - error = aoti_torch_delete_tensor_object(view1_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - // Second view should still be valid - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view2_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete second view - NOW memory should be freed (reference count = 0) - error = aoti_torch_delete_tensor_object(view2_tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); } -// Test reference counting behavior with NOT_OWN memory (from blob) - should -// SUCCEED and keep NOT_OWN 
-TEST_F(AOTITorchReinterpretTensorTest, ViewOfNotOwnMemoryKeepsNotOwnStatus) { - // Allocate external memory - void* external_memory; - cudaError_t cuda_err = - cudaMallocManaged(&external_memory, 12 * sizeof(float)); - ASSERT_EQ(cuda_err, cudaSuccess); - - // Create tensor from blob (which marks memory as NOT_OWN) - std::vector blob_sizes = {12}; - std::vector blob_strides = calculate_contiguous_strides(blob_sizes); - - Tensor* blob_tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - external_memory, - blob_sizes.size(), - blob_sizes.data(), - blob_strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device_index - &blob_tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +TEST_F(AOTITorchReinterpretTensorSlimTest, ChainedViews_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {24}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(blob_tensor, nullptr); - - // Create view of NOT_OWN memory - should SUCCEED and keep NOT_OWN status - std::vector view_sizes = {3, 4}; - std::vector view_strides = calculate_contiguous_strides(view_sizes); - - Tensor* view_tensor; - error = aoti_torch__reinterpret_tensor( - blob_tensor, - view_sizes.size(), - view_sizes.data(), - view_strides.data(), - 0, - &view_tensor); + void* orig_ptr = orig_tensor->data_ptr(); - // Should succeed - NOT_OWN memory can be reinterpreted but stays NOT_OWN - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->mutable_data_ptr(), external_memory); - - // Verify both tensors share the same memory - EXPECT_EQ(blob_tensor->mutable_data_ptr(), view_tensor->mutable_data_ptr()); - - // Test memory sharing by writing data through one tensor and reading through - // the other - float test_value = 42.0f; - cuda_err = cudaMemcpy( - external_memory, &test_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete the blob tensor - external memory should NOT be freed (NOT_OWN - // behavior) - error = aoti_torch_delete_tensor_object(blob_tensor); - EXPECT_EQ(error, Error::Ok); + std::vector sizes1 = {4, 6}; + std::vector strides1 = {6, 1}; - // View tensor should still be valid - test by accessing memory - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, - view_tensor->mutable_data_ptr(), - sizeof(float), - cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); - - // Delete view tensor - external memory should still NOT be freed (NOT_OWN - // behavior) - error = aoti_torch_delete_tensor_object(view_tensor); - EXPECT_EQ(error, Error::Ok); + Tensor* view1 = nullptr; + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + orig_tensor, + sizes1.size(), + sizes1.data(), + strides1.data(), + 0, + &view1), + Error::Ok); + + std::vector sizes2 = {2, 2, 6}; + std::vector strides2 = {12, 6, 1}; + + Tensor* view2 = nullptr; + EXPECT_EQ( + aoti_torch__reinterpret_tensor( + view1, sizes2.size(), sizes2.data(), strides2.data(), 0, &view2), + 
Error::Ok); - // External memory should still be accessible (proves neither tensor freed it) - readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, external_memory, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess); - EXPECT_EQ(readback_value, test_value); + EXPECT_EQ(view1->data_ptr(), orig_ptr); + EXPECT_EQ(view2->data_ptr(), orig_ptr); - // Clean up external memory manually (as expected for NOT_OWN memory) - ASSERT_EQ(cudaFree(external_memory), cudaSuccess); + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp deleted file mode 100644 index d2ad645136e..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor_slim.cpp +++ /dev/null @@ -1,692 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchReinterpretTensorSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 0; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 6); - EXPECT_EQ(view_tensor->size(1), 4); - EXPECT_EQ(view_tensor->stride(0), 4); - EXPECT_EQ(view_tensor->stride(1), 1); - - EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NullSelf) { - std::vector sizes = {2, 3}; - std::vector strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - nullptr, sizes.size(), sizes.data(), strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NullReturnPointer) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6}; - std::vector new_strides = {1}; - - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, NegativeNdim) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {6}; - std::vector new_strides = {1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, -1, new_sizes.data(), new_strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// ============================================================================ -// Storage Offset Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CPU) { - std::vector sizes = {4, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 4; // Skip first row - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - 
orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 2); - EXPECT_EQ(view_tensor->size(1), 4); - - char* orig_ptr = static_cast(orig_tensor->data_ptr()); - char* view_ptr = static_cast(view_tensor->data_ptr()); - EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// Memory Sharing Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, MultipleViews_CPU) { - std::vector sizes = {24}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector sizes1 = {2, 12}; - std::vector strides1 = {12, 1}; - - std::vector sizes2 = {4, 6}; - std::vector strides2 = {6, 1}; - - std::vector sizes3 = {2, 3, 4}; - std::vector strides3 = {12, 4, 1}; - - Tensor* view1 = nullptr; - Tensor* view2 = nullptr; - Tensor* view3 = nullptr; - - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes1.size(), - sizes1.data(), - strides1.data(), - 0, - &view1), - Error::Ok); - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes2.size(), - sizes2.data(), - strides2.data(), - 0, - &view2), - Error::Ok); - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes3.size(), - sizes3.data(), - strides3.data(), - 0, - &view3), - Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - EXPECT_EQ(view3->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - EXPECT_EQ(view3->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view3), Error::Ok); -} - -// ============================================================================ -// Dimension Change Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, ExpandDimensions_CPU) { - 
std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->dim(), 1); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, CollapseDimensions_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->dim(), 3); - - std::vector new_sizes = {24}; - std::vector new_strides = {1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 1); - EXPECT_EQ(view_tensor->numel(), 24); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, ScalarTensorView_CPU) { - std::vector sizes = {1}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {}; - std::vector new_strides = {}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, 0, new_sizes.data(), new_strides.data(), 0, &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->dim(), 0); - EXPECT_EQ(view_tensor->numel(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// Stride Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, TransposeViaStrides_CPU) { - std::vector sizes = {3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {4, 3}; - std::vector new_strides = {1, 4}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->size(0), 4); - EXPECT_EQ(view_tensor->size(1), 3); - EXPECT_EQ(view_tensor->stride(0), 1); - EXPECT_EQ(view_tensor->stride(1), 4); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// 
============================================================================ -// Different Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, Int64Tensor_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->itemsize(), 8); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, BFloat16Tensor_CPU) { - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_EQ(view_tensor->itemsize(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchReinterpretTensorSlimTest, BasicView_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_TRUE(orig_tensor->is_cuda()); - - std::vector new_sizes = {6, 4}; - std::vector new_strides = {4, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_TRUE(view_tensor->is_cuda()); - - EXPECT_EQ(view_tensor->dim(), 2); - EXPECT_EQ(view_tensor->size(0), 6); - EXPECT_EQ(view_tensor->size(1), 4); - - EXPECT_EQ(view_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, WithStorageOffset_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {4, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - std::vector new_sizes = {2, 4}; - std::vector new_strides = {4, 1}; - int64_t storage_offset = 8; - - Tensor* view_tensor = nullptr; - 
AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - storage_offset, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - EXPECT_TRUE(view_tensor->is_cuda()); - - char* orig_ptr = static_cast(orig_tensor->data_ptr()); - char* view_ptr = static_cast(view_tensor->data_ptr()); - EXPECT_EQ(view_ptr, orig_ptr + storage_offset * sizeof(float)); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, MemorySharing_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {6}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector new_sizes = {2, 3}; - std::vector new_strides = {3, 1}; - - Tensor* view_tensor = nullptr; - AOTITorchError error = aoti_torch__reinterpret_tensor( - orig_tensor, - new_sizes.size(), - new_sizes.data(), - new_strides.data(), - 0, - &view_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(view_tensor, nullptr); - - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(view_tensor->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(view_tensor), Error::Ok); -} - -TEST_F(AOTITorchReinterpretTensorSlimTest, ChainedViews_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {24}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector sizes1 = {4, 6}; - std::vector strides1 = {6, 1}; - - Tensor* view1 = nullptr; - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - orig_tensor, - sizes1.size(), - sizes1.data(), - strides1.data(), - 0, - &view1), - Error::Ok); - - std::vector sizes2 = {2, 2, 6}; - std::vector strides2 = {12, 6, 1}; - - Tensor* view2 = nullptr; - EXPECT_EQ( - aoti_torch__reinterpret_tensor( - view1, sizes2.size(), sizes2.data(), strides2.data(), 0, &view2), - Error::Ok); - - EXPECT_EQ(view1->data_ptr(), orig_ptr); - EXPECT_EQ(view2->data_ptr(), orig_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view1), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(view2), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp index d5e1bcb2547..f01743745d2 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out.cpp @@ -7,239 +7,431 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_assign_tensors_out tests -class 
AOTITorchAssignTensorsOutTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchAssignTensorsOutSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a test tensor - Tensor* create_test_tensor( + Tensor* createTestTensor( const std::vector& sizes, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA)) { - std::vector strides; - // Calculate contiguous strides - if (!sizes.empty()) { - strides.resize(sizes.size()); - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } + const std::vector& strides = {}, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); } - Tensor* tensor; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); - AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, - 0, + device_index, &tensor); return (error == Error::Ok) ? 
tensor : nullptr; } }; -// Test basic functionality -TEST_F(AOTITorchAssignTensorsOutTest, BasicFunctionality) { - // Create a source tensor - std::vector sizes = {2, 3}; - Tensor* src = create_test_tensor(sizes); - ASSERT_NE(src, nullptr); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ - // Create output tensor handle - Tensor* dst = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst); +TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CPU) { + std::vector sizes = {2, 3}; + Tensor* src_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src_tensor, nullptr); + + // Store expected properties before move + int64_t expected_dim = src_tensor->dim(); + int64_t expected_size0 = src_tensor->size(0); + int64_t expected_size1 = src_tensor->size(1); + size_t expected_numel = src_tensor->numel(); + void* expected_data_ptr = src_tensor->data_ptr(); + + Tensor* dst_tensor = nullptr; + AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(dst, nullptr); + ASSERT_NE(dst_tensor, nullptr); + + // Verify destination tensor has the moved properties + EXPECT_EQ(dst_tensor->dim(), expected_dim); + EXPECT_EQ(dst_tensor->size(0), expected_size0); + EXPECT_EQ(dst_tensor->size(1), expected_size1); + EXPECT_EQ(dst_tensor->numel(), expected_numel); + EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); + + // Source tensor is now in undefined state after move - just delete it + // (accessing src_tensor properties is undefined behavior after move) + delete src_tensor; // Direct delete since it's in undefined state + EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); +} + +TEST_F(AOTITorchAssignTensorsOutSlimTest, NullSrc) { + Tensor* dst_tensor = nullptr; + AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst_tensor); - // Verify the output tensor has the same properties as source - EXPECT_EQ(dst->dim(), src->dim()); - EXPECT_EQ(dst->size(0), src->size(0)); - EXPECT_EQ(dst->size(1), src->size(1)); - EXPECT_EQ(dst->numel(), src->numel()); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchAssignTensorsOutSlimTest, NullDst) { + std::vector sizes = {2, 3}; + Tensor* src_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src_tensor, nullptr); + + AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, nullptr); - // Verify they share the same memory - EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr()); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); } -// Test with 1D tensor -TEST_F(AOTITorchAssignTensorsOutTest, OneDimensionalTensor) { - std::vector sizes = {10}; - Tensor* src = create_test_tensor(sizes); - ASSERT_NE(src, nullptr); +// ============================================================================ +// Move Semantics Tests +// ============================================================================ - Tensor* dst = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst); +TEST_F(AOTITorchAssignTensorsOutSlimTest, SourceBecomesUndefinedAfterMove_CPU) { + std::vector sizes = {3, 4}; + Tensor* src_tensor = createTestTensor( + sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 1);
-  EXPECT_EQ(dst->size(0), 10);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
-}
+  ASSERT_NE(dst_tensor, nullptr);

-// Test with 3D tensor
-TEST_F(AOTITorchAssignTensorsOutTest, ThreeDimensionalTensor) {
-  std::vector<int64_t> sizes = {2, 3, 4};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is now in undefined state - verify it's no longer defined
+  EXPECT_FALSE(src_tensor->defined());

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  // Clean up - delete in this order since src is undefined
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+// ============================================================================
+// Tensor Property Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, CustomStrides_CPU) {
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {4, 1};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      strides,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  // Store expected strides before move
+  int64_t expected_stride0 = src_tensor->stride(0);
+  int64_t expected_stride1 = src_tensor->stride(1);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 3);
-  EXPECT_EQ(dst->size(0), 2);
-  EXPECT_EQ(dst->size(1), 3);
-  EXPECT_EQ(dst->size(2), 4);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Verify destination has the expected strides
+  EXPECT_EQ(dst_tensor->stride(0), expected_stride0);
+  EXPECT_EQ(dst_tensor->stride(1), expected_stride1);
+
+  delete src_tensor; // Source is undefined after move
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with scalar (0D) tensor
-TEST_F(AOTITorchAssignTensorsOutTest, ScalarTensor) {
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ScalarTensor_CPU) {
   std::vector<int64_t> sizes = {};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_EQ(src_tensor->dim(), 0);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  EXPECT_EQ(dst_tensor->dim(), 0);
+  EXPECT_EQ(dst_tensor->numel(), 1);

-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->dim(), 0);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with null source pointer
-TEST_F(AOTITorchAssignTensorsOutTest, NullSourcePointer) {
-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst);
-  EXPECT_EQ(error, Error::InvalidArgument);
+TEST_F(AOTITorchAssignTensorsOutSlimTest, LargeMultiDimensionalTensor_CPU) {
+  std::vector<int64_t> sizes = {10, 20, 30};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->dim(), 3);
+  EXPECT_EQ(dst_tensor->size(0), 10);
+  EXPECT_EQ(dst_tensor->size(1), 20);
+  EXPECT_EQ(dst_tensor->size(2), 30);
+  EXPECT_EQ(dst_tensor->numel(), 6000);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with null destination pointer
-TEST_F(AOTITorchAssignTensorsOutTest, NullDestinationPointer) {
+// ============================================================================
+// Different Dtype Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, Int64Tensor_CPU) {
   std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Long),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  AOTITorchError error = aoti_torch_assign_tensors_out(src, nullptr);
-  EXPECT_EQ(error, Error::InvalidArgument);
+  EXPECT_EQ(dst_tensor->itemsize(), 8);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test that strides are preserved
-TEST_F(AOTITorchAssignTensorsOutTest, StridesPreserved) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(sizes);
-  ASSERT_NE(src, nullptr);
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BFloat16Tensor_CPU) {
+  std::vector<int64_t> sizes = {2, 3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::BFloat16),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  EXPECT_EQ(dst_tensor->itemsize(), 2);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BoolTensor_CPU) {
+  std::vector<int64_t> sizes = {4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Bool),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
+  ASSERT_NE(dst_tensor, nullptr);

-  // Get strides from both tensors
-  int64_t* src_strides;
-  int64_t* dst_strides;
-  aoti_torch_get_strides(src, &src_strides);
-  aoti_torch_get_strides(dst, &dst_strides);
+  EXPECT_EQ(dst_tensor->itemsize(), 1);

-  // Verify strides match
-  for (int64_t i = 0; i < src->dim(); i++) {
-    EXPECT_EQ(src_strides[i], dst_strides[i]);
-  }
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test with CPU tensor
-TEST_F(AOTITorchAssignTensorsOutTest, CPUTensor) {
+// ============================================================================
+// CUDA Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
   std::vector<int64_t> sizes = {2, 3};
-  Tensor* src = create_test_tensor(
+  Tensor* src_tensor = createTestTensor(
       sizes,
-      static_cast<int32_t>(SupportedDTypes::FLOAT32),
-      static_cast<int32_t>(SupportedDevices::CPU));
-  ASSERT_NE(src, nullptr);
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_TRUE(src_tensor->is_cuda());

-  Tensor* dst = nullptr;
-  AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
+  // Store expected properties before move
+  void* expected_data_ptr = src_tensor->data_ptr();
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+  EXPECT_TRUE(dst_tensor->is_cuda());
+  EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr);
+
+  // Source is undefined after move
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(
+    AOTITorchAssignTensorsOutSlimTest,
+    SourceBecamesUndefinedAfterMove_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
   EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(dst, nullptr);
-  EXPECT_EQ(dst->mutable_data_ptr(), src->mutable_data_ptr());
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is now in undefined state
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
 }

-// Test dtype is preserved
-TEST_F(AOTITorchAssignTensorsOutTest, DtypePreserved) {
-  // Test with different dtypes
-  std::vector<int32_t> dtypes = {
-      static_cast<int32_t>(SupportedDTypes::FLOAT32),
-      static_cast<int32_t>(SupportedDTypes::INT32),
-      static_cast<int32_t>(SupportedDTypes::INT64),
-  };
-
-  for (int32_t dtype : dtypes) {
-    cleanup_tensor_metadata();
-    clear_all_tensors();
-
-    std::vector<int64_t> sizes = {2, 3};
-    Tensor* src = create_test_tensor(sizes, dtype);
-    ASSERT_NE(src, nullptr);
-
-    Tensor* dst = nullptr;
-    AOTITorchError error = aoti_torch_assign_tensors_out(src, &dst);
-
-    EXPECT_EQ(error, Error::Ok);
-    EXPECT_NE(dst, nullptr);
-
-    // Verify dtype is preserved
-    int32_t src_dtype, dst_dtype;
-    aoti_torch_get_dtype(src, &src_dtype);
-    aoti_torch_get_dtype(dst, &dst_dtype);
-    EXPECT_EQ(src_dtype, dst_dtype)
-        << "Dtype mismatch for dtype code: " << dtype;
+// ============================================================================
+// Mixed Device Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, MixedDeviceAssignments) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
   }
+
+  std::vector<int64_t> sizes = {2, 3};
+
+  Tensor* cpu_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(cpu_src, nullptr);
+  EXPECT_TRUE(cpu_src->is_cpu());
+
+  Tensor* cuda_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(cuda_src, nullptr);
+  EXPECT_TRUE(cuda_src->is_cuda());
+
+  Tensor* cpu_dst = nullptr;
+  Tensor* cuda_dst = nullptr;
+
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cpu_src, &cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cuda_src, &cuda_dst), Error::Ok);
+
+  EXPECT_TRUE(cpu_dst->is_cpu());
+  EXPECT_TRUE(cuda_dst->is_cuda());
+  EXPECT_NE(cpu_dst->data_ptr(), cuda_dst->data_ptr());
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_dst), Error::Ok);
 }
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
deleted file mode 100644
index f01743745d2..00000000000
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchAssignTensorsOutSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CPU) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - // Store expected properties before move - int64_t expected_dim = src_tensor->dim(); - int64_t expected_size0 = src_tensor->size(0); - int64_t expected_size1 = src_tensor->size(1); - size_t expected_numel = src_tensor->numel(); - void* expected_data_ptr = src_tensor->data_ptr(); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Verify destination tensor has the moved properties - EXPECT_EQ(dst_tensor->dim(), expected_dim); - EXPECT_EQ(dst_tensor->size(0), expected_size0); - EXPECT_EQ(dst_tensor->size(1), expected_size1); - EXPECT_EQ(dst_tensor->numel(), expected_numel); - EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); - - // Source tensor is now in undefined state after move - just delete it - // (accessing src_tensor properties is undefined behavior after move) - delete src_tensor; // Direct delete since it's in undefined state - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, NullSrc) { - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, NullDst) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - AOTITorchError 
error = aoti_torch_assign_tensors_out(src_tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); -} - -// ============================================================================ -// Move Semantics Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, SourceBecamesUndefinedAfterMove_CPU) { - std::vector sizes = {3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - void* original_ptr = src_tensor->data_ptr(); - ASSERT_NE(original_ptr, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Destination has the original pointer - EXPECT_EQ(dst_tensor->data_ptr(), original_ptr); - - // Source tensor is now in undefined state - verify it's no longer defined - EXPECT_FALSE(src_tensor->defined()); - - // Clean up - delete in this order since src is undefined - delete src_tensor; - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Tensor Property Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, CustomStrides_CPU) { - std::vector sizes = {3, 4}; - std::vector strides = {4, 1}; - Tensor* src_tensor = createTestTensor( - sizes, - strides, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - // Store expected strides before move - int64_t expected_stride0 = src_tensor->stride(0); - int64_t expected_stride1 = src_tensor->stride(1); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Verify destination has the expected strides - EXPECT_EQ(dst_tensor->stride(0), expected_stride0); - EXPECT_EQ(dst_tensor->stride(1), expected_stride1); - - delete src_tensor; // Source is undefined after move - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, ScalarTensor_CPU) { - std::vector sizes = {}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - EXPECT_EQ(src_tensor->dim(), 0); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->dim(), 0); - EXPECT_EQ(dst_tensor->numel(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, LargeMultiDimensionalTensor_CPU) { - std::vector sizes = {10, 20, 30}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, 
Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->dim(), 3); - EXPECT_EQ(dst_tensor->size(0), 10); - EXPECT_EQ(dst_tensor->size(1), 20); - EXPECT_EQ(dst_tensor->size(2), 30); - EXPECT_EQ(dst_tensor->numel(), 6000); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Different Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, Int64Tensor_CPU) { - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 8); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BFloat16Tensor_CPU) { - std::vector sizes = {2, 3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 2); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BoolTensor_CPU) { - std::vector sizes = {4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src_tensor, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - EXPECT_EQ(dst_tensor->itemsize(), 1); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src_tensor, nullptr); - EXPECT_TRUE(src_tensor->is_cuda()); - - // Store expected properties before move - void* expected_data_ptr = src_tensor->data_ptr(); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - EXPECT_TRUE(dst_tensor->is_cuda()); - EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr); - - // Source is undefined after move - EXPECT_FALSE(src_tensor->defined()); - - delete src_tensor; - 
EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -TEST_F( - AOTITorchAssignTensorsOutSlimTest, - SourceBecamesUndefinedAfterMove_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {3, 4}; - Tensor* src_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src_tensor, nullptr); - - void* original_ptr = src_tensor->data_ptr(); - ASSERT_NE(original_ptr, nullptr); - - Tensor* dst_tensor = nullptr; - AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, &dst_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(dst_tensor, nullptr); - - // Destination has the original pointer - EXPECT_EQ(dst_tensor->data_ptr(), original_ptr); - - // Source tensor is now in undefined state - EXPECT_FALSE(src_tensor->defined()); - - delete src_tensor; - EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok); -} - -// ============================================================================ -// Mixed Device Tests -// ============================================================================ - -TEST_F(AOTITorchAssignTensorsOutSlimTest, MixedDeviceAssignments) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - Tensor* cpu_src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(cpu_src, nullptr); - EXPECT_TRUE(cpu_src->is_cpu()); - - Tensor* cuda_src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(cuda_src, nullptr); - EXPECT_TRUE(cuda_src->is_cuda()); - - Tensor* cpu_dst = nullptr; - Tensor* cuda_dst = nullptr; - - EXPECT_EQ(aoti_torch_assign_tensors_out(cpu_src, &cpu_dst), Error::Ok); - EXPECT_EQ(aoti_torch_assign_tensors_out(cuda_src, &cuda_dst), Error::Ok); - - EXPECT_TRUE(cpu_dst->is_cpu()); - EXPECT_TRUE(cuda_dst->is_cuda()); - EXPECT_NE(cpu_dst->data_ptr(), cuda_dst->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_dst), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_dst), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp index 9fca0f92cf8..c2e67732b41 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp @@ -7,392 +7,481 @@ */ #include -#include -#include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::backends::aoti; -using namespace executorch::runtime; +using executorch::runtime::Error; -// Test fixture for aoti_torch_copy_ tests -class AOTITorchCopyTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any 
existing cached metadata before each test - cleanup_tensor_metadata(); +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchCopySlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create test tensors with specific data - Tensor* create_test_tensor_with_data( + Tensor* createTestTensor( const std::vector& sizes, - const std::vector& data, const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - Tensor* tensor; + Tensor* tensor = nullptr; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, &tensor); - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } + return (error == Error::Ok) ? 
tensor : nullptr; + } +}; - // Fill tensor with data - size_t total_bytes = data.size() * sizeof(float); - if (device_type == static_cast(SupportedDevices::CUDA)) { - cudaError_t memcpy_err = cudaMemcpy( - tensor->mutable_data_ptr(), - data.data(), - total_bytes, - cudaMemcpyHostToDevice); - // Note: Error is checked but we don't fail the function - // This allows tests to proceed and handle errors as needed - (void)memcpy_err; // Suppress unused variable warning - } else { // CPU - std::memcpy(tensor->mutable_data_ptr(), data.data(), total_bytes); - } +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ - return tensor; +TEST_F(AOTITorchCopySlimTest, BasicCopy_CPU) { + std::vector sizes = {3, 4}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i + 1); } - // Helper to get data from tensor - std::vector get_tensor_data(Tensor* tensor) { - if (!tensor) { - return {}; - } + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - size_t num_elements = tensor->numel(); - std::vector data(num_elements); - - // Determine if this is a CUDA tensor - cudaPointerAttributes attributes{}; - cudaError_t err = cudaPointerGetAttributes(&attributes, tensor->data_ptr()); - bool is_device = - (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice); - - if (is_device) { - cudaError_t memcpy_err = cudaMemcpy( - data.data(), - tensor->data_ptr(), - num_elements * sizeof(float), - cudaMemcpyDeviceToHost); - // Note: Error is checked but we don't fail the function - // This allows tests to proceed and handle errors as needed - (void)memcpy_err; // Suppress unused variable warning - } else { - std::memcpy( - data.data(), tensor->data_ptr(), num_elements * sizeof(float)); - } + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); - return data; + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_FLOAT_EQ(dst_data[i], static_cast(i + 1)); } - // Helper to verify two tensors have same data - bool tensors_equal(Tensor* a, Tensor* b, float tolerance = 1e-6f) { - if (!a || !b) { - return false; - } - if (a->numel() != b->numel()) { - return false; - } + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - auto data_a = get_tensor_data(a); - auto data_b = get_tensor_data(b); +TEST_F(AOTITorchCopySlimTest, NullSelf) { + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); - for (size_t i = 0; i < data_a.size(); ++i) { - if (std::abs(data_a[i] - data_b[i]) > tolerance) { - return false; - } - } - return true; - } -}; + AOTITorchError error = aoti_torch_copy_(nullptr, src, 0); + EXPECT_EQ(error, Error::InvalidArgument); -// Test basic copy functionality - same schema (fast path) -TEST_F(AOTITorchCopyTest, BasicCopySameSchema) { - // Create source tensor with test data + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); 
+} + +TEST_F(AOTITorchCopySlimTest, NullSrc) { std::vector sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - Tensor* src = create_test_tensor_with_data(sizes, src_data); - EXPECT_NE(src, nullptr); + AOTITorchError error = aoti_torch_copy_(dst, nullptr, 0); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} + +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchCopySlimTest, Int64Copy_CPU) { + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + int64_t* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = i * 100; + } - // Create destination tensor with same schema - Tensor* dst = - create_test_tensor_with_data(sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); - EXPECT_NE(dst, nullptr); + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Perform copy AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy was successful - EXPECT_TRUE(tensors_equal(dst, src)); -} - -// Test copy with different strides (pointwise fallback) -TEST_F(AOTITorchCopyTest, CopyDifferentStrides) { - // Create source tensor (2x3) with contiguous layout - std::vector src_sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_EQ(dst_data[i], i * 100); + } - Tensor* src = create_test_tensor_with_data(src_sizes, src_data); - EXPECT_NE(src, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - // Create destination tensor with transposed strides - std::vector dst_strides = {1, 2}; // Column-major layout - Tensor* dst = create_test_tensor_with_data( - src_sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, dst_strides); - EXPECT_NE(dst, nullptr); +TEST_F(AOTITorchCopySlimTest, BoolCopy_CPU) { + std::vector sizes = {4}; + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + bool* src_data = static_cast(src->data_ptr()); + src_data[0] = true; + src_data[1] = false; + src_data[2] = true; + src_data[3] = false; + + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Perform copy - this should use pointwise fallback AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify the copy worked correctly by checking specific elements - auto dst_data = get_tensor_data(dst); - auto src_data_check = get_tensor_data(src); + bool* dst_data = static_cast(dst->data_ptr()); + EXPECT_EQ(dst_data[0], true); + EXPECT_EQ(dst_data[1], false); + EXPECT_EQ(dst_data[2], true); + EXPECT_EQ(dst_data[3], false); - // For 
transposed layout, the data should be rearranged - EXPECT_EQ(dst_data.size(), 6); - EXPECT_EQ(src_data_check.size(), 6); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy between CPU and CUDA tensors -TEST_F(AOTITorchCopyTest, CopyCPUToCUDA) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; +// ============================================================================ +// Tensor Shape Tests +// ============================================================================ - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, ScalarTensorCopy_CPU) { + std::vector sizes = {}; + Tensor* src = createTestTensor( sizes, - data, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU)); // CPU - EXPECT_NE(cpu_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + EXPECT_EQ(src->dim(), 0); + EXPECT_EQ(src->numel(), 1); + + float* src_data = static_cast(src->data_ptr()); + *src_data = 42.0f; - // Create CUDA tensor - Tensor* cuda_tensor = create_test_tensor_with_data( + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); // CUDA - EXPECT_NE(cuda_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Copy from CPU to CUDA - AOTITorchError error = aoti_torch_copy_(cuda_tensor, cpu_tensor, 0); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy - EXPECT_TRUE(tensors_equal(cuda_tensor, cpu_tensor)); -} + float* dst_data = static_cast(dst->data_ptr()); + EXPECT_FLOAT_EQ(*dst_data, 42.0f); -// Test copy between CUDA and CPU tensors -TEST_F(AOTITorchCopyTest, CopyCUDAToCPU) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); +} - // Create CUDA tensor - Tensor* cuda_tensor = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, LargeTensorCopy_CPU) { + std::vector sizes = {100, 100}; + Tensor* src = createTestTensor( sizes, - data, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); // CUDA - EXPECT_NE(cuda_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i); + } - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor_with_data( + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, {}, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU)); // CPU - EXPECT_NE(cpu_tensor, nullptr); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); - // Copy from CUDA to CPU - AOTITorchError error = aoti_torch_copy_(cpu_tensor, cuda_tensor, 0); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify copy - EXPECT_TRUE(tensors_equal(cpu_tensor, cuda_tensor)); + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + 
EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy with bf16 dtype support -TEST_F(AOTITorchCopyTest, CopyBf16Tensors) { - // Test that bf16 tensors can be created and copied - std::vector sizes = {2, 3}; - std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; +// ============================================================================ +// CUDA Tests +// ============================================================================ - // Note: We create float32 data but the tensor will be created with bf16 dtype - // This simulates creating bf16 tensors - Tensor* src = create_test_tensor_with_data( +TEST_F(AOTITorchCopySlimTest, CudaToCuda) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {3, 4}; + + std::vector host_src_data(12); + for (size_t i = 0; i < host_src_data.size(); i++) { + host_src_data[i] = static_cast(i + 1); + } + + Tensor* src = createTestTensor( sizes, - src_data, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(src, nullptr); - - // Create destination tensor with bf16 dtype - std::vector dst_init(6, 0.0f); - Tensor* dst = create_test_tensor_with_data( + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(src, nullptr); + EXPECT_TRUE(src->is_cuda()); + + cudaMemcpy( + src->data_ptr(), + host_src_data.data(), + host_src_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + + Tensor* dst = createTestTensor( sizes, - dst_init, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(dst, nullptr); - - // Perform copy between bf16 tensors + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cuda()); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); EXPECT_EQ(error, Error::Ok); - // Verify that both tensors have the expected dtype - int32_t src_dtype, dst_dtype; - aoti_torch_get_dtype(src, &src_dtype); - aoti_torch_get_dtype(dst, &dst_dtype); + std::vector host_dst_data(12); + cudaMemcpy( + host_dst_data.data(), + dst->data_ptr(), + host_dst_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); - EXPECT_EQ(src_dtype, static_cast(SupportedDTypes::BFLOAT16)); - EXPECT_EQ(dst_dtype, static_cast(SupportedDTypes::BFLOAT16)); + for (size_t i = 0; i < host_dst_data.size(); i++) { + EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i + 1)); + } - // Verify copy was successful by checking numel matches - EXPECT_EQ(src->numel(), dst->numel()); - EXPECT_EQ(src->numel(), 6); + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test copy between different dtypes should fail -TEST_F(AOTITorchCopyTest, CopyDTypeMismatchError) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; +TEST_F(AOTITorchCopySlimTest, CpuToCuda) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Create float32 tensor - Tensor* float32_tensor = create_test_tensor_with_data( + std::vector sizes = {2, 3}; + Tensor* src = createTestTensor( sizes, - data, - {}, // default strides - 
static_cast(SupportedDTypes::FLOAT32), // float32 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(float32_tensor, nullptr); - - // Create bf16 tensor - Tensor* bf16_tensor = create_test_tensor_with_data( + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(src, nullptr); + EXPECT_TRUE(src->is_cpu()); + + float* src_data = static_cast(src->data_ptr()); + for (int64_t i = 0; i < src->numel(); i++) { + src_data[i] = static_cast(i * 10); + } + + Tensor* dst = createTestTensor( sizes, - {0.0f, 0.0f, 0.0f, 0.0f}, - {}, // default strides - static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype - static_cast(SupportedDevices::CUDA), // CUDA device - 0 // device_index = 0 - ); - EXPECT_NE(bf16_tensor, nullptr); - - // Attempting to copy between different dtypes should fail - AOTITorchError error = aoti_torch_copy_(bf16_tensor, float32_tensor, 0); - EXPECT_EQ(error, Error::InvalidArgument); + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cuda()); - // Reverse direction should also fail - error = aoti_torch_copy_(float32_tensor, bf16_tensor, 0); - EXPECT_EQ(error, Error::InvalidArgument); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + std::vector host_dst_data(6); + cudaMemcpy( + host_dst_data.data(), + dst->data_ptr(), + host_dst_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < host_dst_data.size(); i++) { + EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i * 10)); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); } -// Test error conditions -TEST_F(AOTITorchCopyTest, ErrorHandling) { +TEST_F(AOTITorchCopySlimTest, CudaToCpu) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {2, 3}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - Tensor* valid_tensor = create_test_tensor_with_data(sizes, data); - EXPECT_NE(valid_tensor, nullptr); + std::vector host_src_data(6); + for (size_t i = 0; i < host_src_data.size(); i++) { + host_src_data[i] = static_cast(i * 5); + } - // Test null pointers - AOTITorchError error = aoti_torch_copy_(nullptr, valid_tensor, 0); - EXPECT_NE(error, Error::Ok); + Tensor* src = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(src, nullptr); + + cudaMemcpy( + src->data_ptr(), + host_src_data.data(), + host_src_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + + Tensor* dst = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(dst, nullptr); + EXPECT_TRUE(dst->is_cpu()); - error = aoti_torch_copy_(valid_tensor, nullptr, 0); - EXPECT_NE(error, Error::Ok); + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); - // Test numel mismatch (different total number of elements) - std::vector different_numel_sizes = { - 2, 3, 4}; // 24 elements vs 6 elements - std::vector different_data(24, 1.0f); - Tensor* different_numel = - create_test_tensor_with_data(different_numel_sizes, different_data); - EXPECT_NE(different_numel, nullptr); + float* dst_data = static_cast(dst->data_ptr()); + for (int64_t i = 0; i < dst->numel(); i++) { + EXPECT_FLOAT_EQ(dst_data[i], 
static_cast<float>(i * 5));
+  }

-  error = aoti_torch_copy_(valid_tensor, different_numel, 0);
-  EXPECT_EQ(error, Error::InvalidArgument);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok);
 }

-// Test copy from 1D to 3D with same total elements
-TEST_F(AOTITorchCopyTest, Copy1DTo3DSameNumel) {
-  // Source tensor: 8 elements in 1D
-  std::vector<int64_t> src_sizes = {8};
-  std::vector<float> src_data = {
-      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+// ============================================================================
+// Non-blocking Tests
+// ============================================================================

-  Tensor* src = create_test_tensor_with_data(src_sizes, src_data);
-  EXPECT_NE(src, nullptr);
+TEST_F(AOTITorchCopySlimTest, NonBlockingFlag_CPU) {
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src, nullptr);
+
+  float* src_data = static_cast<float*>(src->data_ptr());
+  for (int64_t i = 0; i < src->numel(); i++) {
+    src_data[i] = static_cast<float>(i);
   }

-  // Destination tensor: 2x2x2 = 8 elements (different shape, same total)
-  std::vector<int64_t> dst_sizes = {2, 2, 2};
-  std::vector<float> dst_init(8, 0.0f);
-  Tensor* dst = create_test_tensor_with_data(dst_sizes, dst_init);
-  EXPECT_NE(dst, nullptr);
+  Tensor* dst = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(dst, nullptr);

-  // This should work - same total number of elements
-  AOTITorchError error = aoti_torch_copy_(dst, src, 0);
+  AOTITorchError error = aoti_torch_copy_(dst, src, 1);
   EXPECT_EQ(error, Error::Ok);

-  // Verify the data was copied correctly
-  auto dst_data = get_tensor_data(dst);
-  EXPECT_EQ(dst_data.size(), 8);
+  float* dst_data = static_cast<float*>(dst->data_ptr());
+  for (int64_t i = 0; i < dst->numel(); i++) {
+    EXPECT_FLOAT_EQ(dst_data[i], static_cast<float>(i));
   }

-  // Check some specific elements to verify correct copying
-  EXPECT_FLOAT_EQ(dst_data[0], 1.0f);
-  EXPECT_FLOAT_EQ(dst_data[7], 8.0f);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok);
 }
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp
deleted file mode 100644
index c2e67732b41..00000000000
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy__slim.cpp
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchCopySlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, BasicCopy_CPU) { - std::vector sizes = {3, 4}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i + 1); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i + 1)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, NullSelf) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - AOTITorchError error = aoti_torch_copy_(nullptr, src, 0); - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, NullSrc) { - std::vector sizes = {2, 3}; - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, nullptr, 0); - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Different 
Dtype Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, Int64Copy_CPU) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - int64_t* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = i * 100; - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - int64_t* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_EQ(dst_data[i], i * 100); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, BoolCopy_CPU) { - std::vector sizes = {4}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - bool* src_data = static_cast(src->data_ptr()); - src_data[0] = true; - src_data[1] = false; - src_data[2] = true; - src_data[3] = false; - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - bool* dst_data = static_cast(dst->data_ptr()); - EXPECT_EQ(dst_data[0], true); - EXPECT_EQ(dst_data[1], false); - EXPECT_EQ(dst_data[2], true); - EXPECT_EQ(dst_data[3], false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Tensor Shape Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, ScalarTensorCopy_CPU) { - std::vector sizes = {}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - EXPECT_EQ(src->dim(), 0); - EXPECT_EQ(src->numel(), 1); - - float* src_data = static_cast(src->data_ptr()); - *src_data = 42.0f; - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - EXPECT_FLOAT_EQ(*dst_data, 42.0f); - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, LargeTensorCopy_CPU) { - std::vector sizes = {100, 100}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, 
nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, CudaToCuda) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {3, 4}; - - std::vector host_src_data(12); - for (size_t i = 0; i < host_src_data.size(); i++) { - host_src_data[i] = static_cast(i + 1); - } - - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src, nullptr); - EXPECT_TRUE(src->is_cuda()); - - cudaMemcpy( - src->data_ptr(), - host_src_data.data(), - host_src_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cuda()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - std::vector host_dst_data(12); - cudaMemcpy( - host_dst_data.data(), - dst->data_ptr(), - host_dst_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - - for (size_t i = 0; i < host_dst_data.size(); i++) { - EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i + 1)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, CpuToCuda) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - EXPECT_TRUE(src->is_cpu()); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i * 10); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cuda()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - std::vector host_dst_data(6); - cudaMemcpy( - host_dst_data.data(), - dst->data_ptr(), - host_dst_data.size() * sizeof(float), - cudaMemcpyDeviceToHost); - - for (size_t i = 0; i < host_dst_data.size(); i++) { - EXPECT_FLOAT_EQ(host_dst_data[i], static_cast(i * 10)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -TEST_F(AOTITorchCopySlimTest, CudaToCpu) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - std::vector host_src_data(6); - for (size_t i = 0; i < host_src_data.size(); i++) { - host_src_data[i] = static_cast(i * 5); - } - - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(src, nullptr); - - cudaMemcpy( - src->data_ptr(), - 
host_src_data.data(), - host_src_data.size() * sizeof(float), - cudaMemcpyHostToDevice); - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - EXPECT_TRUE(dst->is_cpu()); - - AOTITorchError error = aoti_torch_copy_(dst, src, 0); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i * 5)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} - -// ============================================================================ -// Non-blocking Tests -// ============================================================================ - -TEST_F(AOTITorchCopySlimTest, NonBlockingFlag_CPU) { - std::vector sizes = {2, 3}; - Tensor* src = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(src, nullptr); - - float* src_data = static_cast(src->data_ptr()); - for (int64_t i = 0; i < src->numel(); i++) { - src_data[i] = static_cast(i); - } - - Tensor* dst = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(dst, nullptr); - - AOTITorchError error = aoti_torch_copy_(dst, src, 1); - EXPECT_EQ(error, Error::Ok); - - float* dst_data = static_cast(dst->data_ptr()); - for (int64_t i = 0; i < dst->numel(); i++) { - EXPECT_FLOAT_EQ(dst_data[i], static_cast(i)); - } - - EXPECT_EQ(aoti_torch_delete_tensor_object(src), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(dst), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp index db0ab84970d..21f8c79cc46 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp @@ -7,380 +7,271 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_create_tensor_from_blob_v2 tests -class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { + +// Helper to check if CUDA is available +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} + +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = 
static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +// Helper to calculate numel from sizes +int64_t calculateNumel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +} // namespace + +// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); - - // Clean up any allocated memory buffers - for (void* ptr : cuda_memory_buffers_) { - if (ptr) { - cudaError_t cuda_err = cudaFree(ptr); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Failed to free CUDA memory: " << cudaGetErrorString(cuda_err); + // Clean up tensors + for (Tensor* t : tensors_) { + delete t; + } + tensors_.clear(); + + // Clean up CUDA memory + for (void* ptr : cuda_memory_) { + if (ptr != nullptr) { + cudaFree(ptr); } } - cuda_memory_buffers_.clear(); + cuda_memory_.clear(); - for (void* ptr : cpu_memory_buffers_) { - if (ptr) { + // Clean up CPU memory + for (void* ptr : cpu_memory_) { + if (ptr != nullptr) { free(ptr); } } - cpu_memory_buffers_.clear(); + cpu_memory_.clear(); } - // Helper to allocate CUDA memory and track it for cleanup - void* allocate_cuda_memory(size_t bytes) { - void* ptr; - cudaError_t err = cudaMallocManaged(&ptr, bytes); - if (err == cudaSuccess) { - cuda_memory_buffers_.push_back(ptr); - return ptr; + void* allocateCudaMemory(size_t bytes) { + void* ptr = nullptr; + cudaError_t err = cudaMalloc(&ptr, bytes); + if (err == cudaSuccess && ptr != nullptr) { + cuda_memory_.push_back(ptr); } - return nullptr; + return ptr; } - // Helper to allocate CPU memory and track it for cleanup - void* allocate_cpu_memory(size_t bytes) { - void* ptr; - int result = posix_memalign(&ptr, 16, bytes); // 16-byte aligned + void* allocateCpuMemory(size_t bytes) { + void* ptr = nullptr; + int result = posix_memalign(&ptr, 16, bytes); if (result == 0 && ptr != nullptr) { - cpu_memory_buffers_.push_back(ptr); - return ptr; + cpu_memory_.push_back(ptr); } - return nullptr; + return ptr; } - // Helper to calculate number of elements from sizes - int64_t calculate_numel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); } - return numel; - } - - // Helper to calculate contiguous strides from sizes - std::vector calculate_contiguous_strides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - - strides[sizes.size() - 1] = 1; - // Use int64_t and check for underflow to avoid unsigned integer wraparound - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; } private: - std::vector cuda_memory_buffers_; - std::vector cpu_memory_buffers_; + std::vector tensors_; + std::vector cuda_memory_; + std::vector cpu_memory_; }; -// Test basic functionality with CUDA memory -TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCUDA) { - // Test 1D tensor - 
std::vector sizes_1d = {5}; - std::vector strides_1d = calculate_contiguous_strides(sizes_1d); - - // Allocate CUDA memory - size_t bytes = calculate_numel(sizes_1d) * sizeof(float); - void* cuda_data = allocate_cuda_memory(bytes); - ASSERT_NE(cuda_data, nullptr); - - Tensor* tensor_1d; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cuda_data, - sizes_1d.size(), - sizes_1d.data(), - strides_1d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_1d, - 0, // layout (strided) - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_1d, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor_1d->dim(), 1); - EXPECT_EQ(tensor_1d->size(0), 5); +// ============================================================================ +// Common test body - parameterized by device type +// ============================================================================ - // Verify the tensor uses the same data pointer - void* tensor_data = tensor_1d->mutable_data_ptr(); - EXPECT_EQ(tensor_data, cuda_data); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor_1d); - EXPECT_EQ(error, Error::Ok); - - // Test that the original memory is still accessible (proves tensor didn't own - // it) For CUDA memory, check that we can still access it (synchronously) - // after tensor deletion - float pattern_value = 42.0f; - cudaError_t cuda_err = cudaMemcpy( - cuda_data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = cudaMemcpy( - &readback_value, cuda_data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; -} - -// Test basic functionality with CPU memory -TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCPU) { - // Test 2D tensor - std::vector sizes_2d = {3, 4}; - std::vector strides_2d = calculate_contiguous_strides(sizes_2d); - - // Allocate CPU memory - size_t bytes = calculate_numel(sizes_2d) * sizeof(float); - void* cpu_data = allocate_cpu_memory(bytes); - ASSERT_NE(cpu_data, nullptr); +void runBasicFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor_2d; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cpu_data, - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), + data, + sizes.size(), + sizes.data(), + strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), - 0, // device index - &tensor_2d, - 0, // layout (strided) + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_2d, nullptr); + ASSERT_NE(tensor, nullptr); // Check tensor properties - EXPECT_EQ(tensor_2d->dim(), 2); - EXPECT_EQ(tensor_2d->size(0), 3); - EXPECT_EQ(tensor_2d->size(1), 4); - - // Verify the tensor 
uses the same data pointer - void* tensor_data = tensor_2d->mutable_data_ptr(); - EXPECT_EQ(tensor_data, cpu_data); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->numel(), 6); + EXPECT_EQ( + static_cast(tensor->dtype()), + static_cast(slim_c10::ScalarType::Float)); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor_2d); - EXPECT_EQ(error, Error::Ok); + // Verify the tensor uses the same data pointer (non-owning) + EXPECT_EQ(tensor->data_ptr(), data); - // Test that the original memory is still accessible (proves tensor didn't own - // it) For CPU memory, directly write and read to verify accessibility - float* float_ptr = reinterpret_cast(cpu_data); - float pattern_value = 42.0f; - *float_ptr = pattern_value; - EXPECT_EQ(*float_ptr, pattern_value) - << "Original CPU memory should still be accessible after tensor deletion"; + // Cleanup - tensor should NOT free the original memory + delete tensor; } -// Test with invalid dtype -TEST_F(AOTITorchCreateTensorFromBlobV2Test, InvalidDtype) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); +void runScalarFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), 0, // storage_offset - 999, // invalid dtype - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size - EXPECT_EQ(error, Error::InvalidArgument); -} - -// Test with non-zero storage offset (should fail since from_blob cannot handle -// offsets) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, NonZeroStorageOffset) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 1, // non-zero storage_offset (should fail since from_blob cannot handle - // offsets) - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->data_ptr(), data); - EXPECT_EQ(error, Error::InvalidArgument); + delete tensor; } -// Test with custom strides (using stride parameter but still contiguous) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, CustomContiguousStrides) { - std::vector sizes = {2, 3}; - // Use the correct contiguous strides but pass them explicitly - std::vector contiguous_strides = {3, 1}; // Proper contiguous strides - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); +void runMultiDimensionalFromBlobTest( + 
AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), - contiguous_strides.data(), // Explicitly pass contiguous strides + strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->dim(), 3); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->numel(), 24); + EXPECT_EQ(tensor->data_ptr(), data); - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); - - // Verify strides were properly set (we can check via aoti_torch_get_strides) - int64_t* tensor_strides; - error = aoti_torch_get_strides(tensor, &tensor_strides); - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(tensor_strides[0], 3); - EXPECT_EQ(tensor_strides[1], 1); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; + delete tensor; } -// Test with null data pointer -TEST_F(AOTITorchCreateTensorFromBlobV2Test, NullDataPointer) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +void runCustomStridesFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - nullptr, // null data pointer + data, sizes.size(), sizes.data(), strides.data(), 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata 0); // opaque_metadata_size - EXPECT_EQ(error, Error::InvalidArgument); -} + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); -// Test scalar tensor (0D) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); + 
EXPECT_FALSE(tensor->is_contiguous()); + EXPECT_EQ(tensor->data_ptr(), data); - size_t bytes = sizeof(float); // Single element - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); + delete tensor; +} + +void runStorageOffsetFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 2}; + std::vector strides = calculateContiguousStrides(sizes); Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( @@ -388,10 +279,10 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 2, // storage_offset = 2 elements + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor, 0, // layout nullptr, // opaque_metadata @@ -400,420 +291,343 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { EXPECT_EQ(error, Error::Ok); ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 0); - - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); - - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor->storage_offset(), 2); + // data_ptr should point to base + offset * itemsize + char* expected_ptr = static_cast(data) + 2 * sizeof(float); + EXPECT_EQ(tensor->data_ptr(), expected_ptr); - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; + delete tensor; } -// Test zero-sized tensor -TEST_F(AOTITorchCreateTensorFromBlobV2Test, ZeroSizedTensor) { - std::vector sizes = {0, 5}; // Zero elements - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// CPU Tests +// ============================================================================ - // Even for zero-sized tensor, we need some memory allocated - size_t bytes = sizeof(float); // Minimum allocation - void* data = allocate_cuda_memory(bytes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); ASSERT_NE(data, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, 
nullptr); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) { + size_t bytes = sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - // Verify the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, data); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) { + size_t bytes = 24 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - // Test that the original memory is still accessible (proves tensor didn't own - // it) - float pattern_value = 42.0f; - cudaError_t cuda_err = - cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to write to original CUDA memory after tensor deletion"; - - float readback_value = 0.0f; - cuda_err = - cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); - EXPECT_EQ(cuda_err, cudaSuccess) - << "Should be able to read from original CUDA memory after tensor deletion"; - EXPECT_EQ(readback_value, pattern_value) - << "Original CUDA memory should still contain our test pattern"; +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) { + size_t bytes = 12 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); } -// Test multi-dimensional tensors -TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultiDimensionalTensors) { - // Test 3D tensor - std::vector sizes_3d = {2, 3, 4}; - std::vector strides_3d = calculate_contiguous_strides(sizes_3d); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) { + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); - size_t bytes_3d = calculate_numel(sizes_3d) * sizeof(float); - void* data_3d = allocate_cuda_memory(bytes_3d); - ASSERT_NE(data_3d, nullptr); + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} - Tensor* tensor_3d; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data_3d, - sizes_3d.size(), - sizes_3d.data(), - strides_3d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_3d, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +// ============================================================================ +// CUDA Tests +// ============================================================================ - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_3d, nullptr); - EXPECT_EQ(tensor_3d->dim(), 3); - EXPECT_EQ(tensor_3d->size(0), 2); - EXPECT_EQ(tensor_3d->size(1), 3); - EXPECT_EQ(tensor_3d->size(2), 4); - - // Test 4D tensor - std::vector sizes_4d = {2, 3, 4, 5}; - std::vector strides_4d = calculate_contiguous_strides(sizes_4d); - - size_t bytes_4d = calculate_numel(sizes_4d) * sizeof(float); - 
void* data_4d = allocate_cuda_memory(bytes_4d); - ASSERT_NE(data_4d, nullptr); - - Tensor* tensor_4d; - error = aoti_torch_create_tensor_from_blob_v2( - data_4d, - sizes_4d.size(), - sizes_4d.data(), - strides_4d.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_4d, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_4d, nullptr); - EXPECT_EQ(tensor_4d->dim(), 4); - EXPECT_EQ(tensor_4d->size(0), 2); - EXPECT_EQ(tensor_4d->size(1), 3); - EXPECT_EQ(tensor_4d->size(2), 4); - EXPECT_EQ(tensor_4d->size(3), 5); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test tensor data pointer consistency -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DataPointerConsistency) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* original_data = allocate_cuda_memory(bytes); - ASSERT_NE(original_data, nullptr); + size_t bytes = sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - original_data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 24 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); - // Check that the tensor uses the same data pointer - void* tensor_data = tensor->mutable_data_ptr(); - EXPECT_EQ(tensor_data, original_data); + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test creating multiple tensors from different blobs -TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultipleTensorsFromBlobs) { - const int num_tensors = 5; - std::vector tensors; - std::vector data_ptrs; - - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {i + 1, i + 2}; - std::vector strides = calculate_contiguous_strides(sizes); - - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - ASSERT_NE(data, nullptr); - data_ptrs.push_back(data); - - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - 
tensors.push_back(tensor); - - // Verify dimensions - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), i + 1); - EXPECT_EQ(tensor->size(1), i + 2); - - // Verify the tensor uses the correct data pointer - EXPECT_EQ(tensor->mutable_data_ptr(), data); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; } - // Verify all tensors have different data pointers - for (int i = 0; i < num_tensors; i++) { - EXPECT_EQ(tensors[i]->mutable_data_ptr(), data_ptrs[i]); - for (int j = i + 1; j < num_tensors; j++) { - EXPECT_NE(tensors[i]->mutable_data_ptr(), tensors[j]->mutable_data_ptr()); - } + size_t bytes = 12 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; } + + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test deletion of tensor created from blob (should not free the original -// memory) -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeletionDoesNotFreeOriginalMemory) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// Verify Non-Owning Behavior +// ============================================================================ - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); ASSERT_NE(data, nullptr); - Tensor* tensor; + // Write a pattern + float* float_data = static_cast(data); + float_data[0] = 42.0f; + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Delete the tensor - this should NOT free the original memory - error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; - // The original memory should still be valid (we'll free it in teardown) - // We can't easily test if the memory is still valid without risking crashes, - // but the test should pass without issues if memory management is correct + // Memory should still be accessible + EXPECT_FLOAT_EQ(float_data[0], 42.0f); } -// Test with opaque metadata -TEST_F(AOTITorchCreateTensorFromBlobV2Test, WithOpaqueMetadata) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, 
NonOwningBehavior_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); ASSERT_NE(data, nullptr); - // Create some opaque metadata - std::vector metadata = {0x01, 0x02, 0x03, 0x04}; + // Write a pattern + float pattern = 42.0f; + cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice); - Tensor* tensor; + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor, - 0, // layout - metadata.data(), // opaque_metadata - metadata.size()); // opaque_metadata_size + 0, + nullptr, + 0); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); -} + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; -// Test stress test with many small tensors from blobs -TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) { - const int num_tensors = 50; // Reduced for reasonable test time - std::vector tensors; + // Memory should still be accessible + float readback = 0.0f; + cudaError_t cuda_err = + cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_FLOAT_EQ(readback, 42.0f); +} - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; // Minimal size - std::vector strides = calculate_contiguous_strides(sizes); +// ============================================================================ +// Error Cases +// ============================================================================ - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* data = allocate_cuda_memory(bytes); - if (data == nullptr) { - // Skip if we run out of memory - continue; - } +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - if (error == Error::Ok && tensor != nullptr) { - tensors.push_back(tensor); - - // Verify the tensor uses the correct data pointer - EXPECT_EQ(tensor->mutable_data_ptr(), data); - } - } + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // null data + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor, + 0, + nullptr, + 0); - // Delete all created tensors - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } + EXPECT_EQ(error, 
Error::InvalidArgument); } -// Test device type mismatch: CPU data with CUDA device request should fail -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCPUDataCUDADevice) { +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); + std::vector strides = calculateContiguousStrides(sizes); + + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + nullptr, // null return pointer + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::InvalidArgument); +} - // Allocate CPU memory - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* cpu_data = allocate_cpu_memory(bytes); - ASSERT_NE(cpu_data, nullptr); +// ============================================================================ +// Verify Device Properties +// ============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - // Request CUDA device but provide CPU memory - should fail + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cpu_data, + data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), // Request CUDA - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_TRUE(tensor->is_cpu()); + EXPECT_FALSE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - EXPECT_EQ(error, Error::InvalidArgument) - << "Should fail when CPU data is provided but CUDA device is requested"; + delete tensor; } -// Test device type mismatch: CUDA data with CPU device request should fail -TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCUDADataCPUDevice) { - std::vector sizes = {2, 3}; - std::vector strides = calculate_contiguous_strides(sizes); +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Allocate CUDA memory (device memory, not managed) - size_t bytes = calculate_numel(sizes) * sizeof(float); - void* cuda_data = nullptr; - cudaError_t cuda_err = cudaMalloc(&cuda_data, bytes); - ASSERT_EQ(cuda_err, cudaSuccess); - ASSERT_NE(cuda_data, nullptr); + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; - // Request CPU device but provide CUDA memory - should fail + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - cuda_data, + data, sizes.size(), sizes.data(), strides.data(), - 0, // storage_offset - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), // 
Request CPU - 0, // device index + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); - EXPECT_EQ(error, Error::InvalidArgument) - << "Should fail when CUDA data is provided but CPU device is requested"; + EXPECT_FALSE(tensor->is_cpu()); + EXPECT_TRUE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - // Clean up the CUDA memory we allocated directly - cudaFree(cuda_data); + delete tensor; } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp deleted file mode 100644 index 21f8c79cc46..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -// Helper to check if CUDA is available -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -// Helper to calculate numel from sizes -int64_t calculateNumel(const std::vector& sizes) { - int64_t numel = 1; - for (int64_t size : sizes) { - numel *= size; - } - return numel; -} - -} // namespace - -// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2 tests -class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Clean up tensors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - - // Clean up CUDA memory - for (void* ptr : cuda_memory_) { - if (ptr != nullptr) { - cudaFree(ptr); - } - } - cuda_memory_.clear(); - - // Clean up CPU memory - for (void* ptr : cpu_memory_) { - if (ptr != nullptr) { - free(ptr); - } - } - cpu_memory_.clear(); - } - - void* allocateCudaMemory(size_t bytes) { - void* ptr = nullptr; - cudaError_t err = cudaMalloc(&ptr, bytes); - if (err == cudaSuccess && ptr != nullptr) { - cuda_memory_.push_back(ptr); - } - return ptr; - } - - void* allocateCpuMemory(size_t bytes) { - void* ptr = nullptr; - int result = posix_memalign(&ptr, 16, bytes); - if (result == 0 && ptr != nullptr) { - cpu_memory_.push_back(ptr); - } - return ptr; - } - - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - private: - std::vector tensors_; - std::vector cuda_memory_; - std::vector cpu_memory_; -}; - -// 
============================================================================ -// Common test body - parameterized by device type -// ============================================================================ - -void runBasicFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->numel(), 6); - EXPECT_EQ( - static_cast(tensor->dtype()), - static_cast(slim_c10::ScalarType::Float)); - - // Verify the tensor uses the same data pointer (non-owning) - EXPECT_EQ(tensor->data_ptr(), data); - - // Cleanup - tensor should NOT free the original memory - delete tensor; -} - -void runScalarFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runMultiDimensionalFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - EXPECT_EQ(tensor->numel(), 24); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runCustomStridesFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, // storage_offset - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->stride(0), 
1); - EXPECT_EQ(tensor->stride(1), 3); - EXPECT_FALSE(tensor->is_contiguous()); - EXPECT_EQ(tensor->data_ptr(), data); - - delete tensor; -} - -void runStorageOffsetFromBlobTest( - AOTITorchCreateTensorFromBlobV2SlimTest* fixture, - void* data, - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 2}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 2, // storage_offset = 2 elements - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor, - 0, // layout - nullptr, // opaque_metadata - 0); // opaque_metadata_size - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->storage_offset(), 2); - // data_ptr should point to base + offset * itemsize - char* expected_ptr = static_cast(data) + 2 * sizeof(float); - EXPECT_EQ(tensor->data_ptr(), expected_ptr); - - delete tensor; -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runBasicFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) { - size_t bytes = sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runScalarFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) { - size_t bytes = 24 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runMultiDimensionalFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) { - size_t bytes = 12 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runCustomStridesFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) { - // Allocate extra space for offset - size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - runStorageOffsetFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CPU), 0); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = 6 * sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - runBasicFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - runScalarFromBlobTest( - this, data, static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, 
MultiDimensional_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 24 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runMultiDimensionalFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 12 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runCustomStridesFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  // Allocate extra space for offset
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  runStorageOffsetFromBlobTest(
-      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
-}
-
-// ============================================================================
-// Verify Non-Owning Behavior
-// ============================================================================
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) {
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCpuMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  // Write a pattern
-  float* float_data = static_cast<float*>(data);
-  float_data[0] = 42.0f;
-
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
-
-  Tensor* tensor = nullptr;
-  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
-      data,
-      sizes.size(),
-      sizes.data(),
-      strides.data(),
-      0,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0,
-      &tensor,
-      0,
-      nullptr,
-      0);
-
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(tensor, nullptr);
-
-  // Delete tensor - memory should NOT be freed
-  delete tensor;
-  tensor = nullptr;
-
-  // Memory should still be accessible
-  EXPECT_FLOAT_EQ(float_data[0], 42.0f);
-}
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  size_t bytes = 6 * sizeof(float);
-  void* data = allocateCudaMemory(bytes);
-  ASSERT_NE(data, nullptr);
-
-  // Write a pattern
-  float pattern = 42.0f;
-  cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice);
-
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
-
-  Tensor* tensor = nullptr;
-  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
-      data,
-      sizes.size(),
-      sizes.data(),
-      strides.data(),
-      0,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0,
-      &tensor,
-      0,
-      nullptr,
-      0);
-
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(tensor, nullptr);
-
-  // Delete tensor - memory should NOT be freed
-  delete tensor;
-  tensor = nullptr;
-
-  // Memory should still be accessible
-  float readback = 0.0f;
-  cudaError_t cuda_err =
-      cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost);
-  EXPECT_EQ(cuda_err, cudaSuccess);
-  EXPECT_FLOAT_EQ(readback, 42.0f);
-}
-
-// ============================================================================
-// Error Cases
-// ============================================================================
-
-TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) {
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides =
calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - nullptr, // null data - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - nullptr, // null return pointer - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -// ============================================================================ -// Verify Device Properties -// ============================================================================ - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) { - size_t bytes = 6 * sizeof(float); - void* data = allocateCpuMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_TRUE(tensor->is_cpu()); - EXPECT_FALSE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - - delete tensor; -} - -TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - size_t bytes = 6 * sizeof(float); - void* data = allocateCudaMemory(bytes); - ASSERT_NE(data, nullptr); - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( - data, - sizes.size(), - sizes.data(), - strides.data(), - 0, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0, - &tensor, - 0, - nullptr, - 0); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_FALSE(tensor->is_cpu()); - EXPECT_TRUE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - - delete tensor; -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp index 10c8d8c1a31..e88ebb3185c 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp @@ -7,64 +7,70 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for 
aoti_torch_delete_tensor_object tests
-class AOTITorchDeleteTensorObjectTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    // Initialize ExecuTorch Platform Abstraction Layer
-    et_pal_init();
+namespace slim_c10 = executorch::backends::aoti::slim::c10;
-    // Check if CUDA is available
-    int device_count = 0;
-    cudaError_t err = cudaGetDeviceCount(&device_count);
-    if (err != cudaSuccess || device_count == 0) {
-      GTEST_SKIP() << "CUDA not available, skipping CUDA tests";
-    }
+namespace {
-    // Clean up any existing cached metadata before each test
-    cleanup_tensor_metadata();
+bool isCudaAvailable() {
+  int device_count = 0;
+  cudaError_t err = cudaGetDeviceCount(&device_count);
+  return (err == cudaSuccess && device_count > 0);
+}
-    // Clear any remaining tensors from previous tests
-    clear_all_tensors();
+std::vector<int64_t> calculateContiguousStrides(
+    const std::vector<int64_t>& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  if (sizes.empty()) {
+    return strides;
+  }
+  strides[sizes.size() - 1] = 1;
+  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * sizes[i + 1];
   }
+  return strides;
+}
-  void TearDown() override {
-    // Clean up metadata
-    cleanup_tensor_metadata();
+} // namespace
-    // Clear the global tensor storage using the provided function
-    clear_all_tensors();
+class AOTITorchDeleteTensorObjectSlimTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
   }
-  // Helper to create test tensors
-  Tensor* create_test_tensor(
+  void TearDown() override {
+    // SlimTensor uses automatic reference counting - no manual cleanup needed
+  }
+
+  Tensor* createTestTensor(
       const std::vector<int64_t>& sizes,
       const std::vector<int64_t>& strides = {},
-      int32_t dtype = 6, // float32
-      int32_t device_type = 1, // CUDA
+      int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float),
+      int32_t device_type = static_cast<int32_t>(slim_c10::DeviceType::CPU),
       int32_t device_index = 0) {
-    Tensor* tensor;
+    Tensor* tensor = nullptr;
-    const int64_t* strides_ptr = strides.empty() ?
nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, @@ -74,254 +80,241 @@ class AOTITorchDeleteTensorObjectTest : public ::testing::Test { } }; -// Test basic deletion of CUDA tensor -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCudaTensorBasic) { - // Create a CUDA tensor +// ============================================================================ +// CPU Tests +// ============================================================================ + +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCpuTensorBasic) { std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties before deletion EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test basic deletion of CPU tensor -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCpuTensorBasic) { - // Create a CPU tensor - std::vector sizes = {3, 4}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 0, 0); // CPU device - ASSERT_NE(tensor, nullptr); - - // Verify tensor properties before deletion - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); - - // Delete the tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -// Test deletion of null tensor pointer -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteNullTensor) { +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteNullTensor) { AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); EXPECT_EQ(error, Error::InvalidArgument); } -// Test deletion of tensor not in tracking system -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteUntrackedTensor) { - // Create a tensor and then clear the tracking system - std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - - // Clear the tracking system (simulating an untracked tensor) - clear_all_tensors(); - - // Try to delete the tensor - should fail - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::InvalidArgument); -} - -// Test deletion of multiple tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMultipleTensors) { - // Create multiple tensors +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CPU) { std::vector tensors; for (int i = 1; i <= 5; i++) { std::vector sizes = {i, i + 1}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); tensors.push_back(tensor); } - // Delete all tensors for (Tensor* tensor : tensors) { AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } } -// Test deletion of zero-sized tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteZeroSizedTensor) { - // Create a zero-sized tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteZeroSizedTensor_CPU) { 
std::vector sizes = {0, 5}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 0); EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->numel(), 0); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of scalar (0D) tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteScalarTensor) { - // Create a scalar tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteScalarTensor_CPU) { std::vector sizes = {}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of large multi-dimensional tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteLargeTensor) { - // Create a large multi-dimensional tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CPU) { std::vector sizes = {10, 20, 30}; - Tensor* tensor = create_test_tensor(sizes); + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 10); - EXPECT_EQ(tensor->size(1), 20); - EXPECT_EQ(tensor->size(2), 30); + EXPECT_EQ(tensor->numel(), 6000); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion of tensors with custom strides -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteTensorWithCustomStrides) { - // Create tensor with custom strides +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteTensorWithCustomStrides_CPU) { std::vector sizes = {3, 4}; - std::vector strides = {4, 1}; // Row-major strides - Tensor* tensor = create_test_tensor(sizes, strides); + std::vector strides = {1, 3}; // Column-major + Tensor* tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(tensor, nullptr); - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); - // Delete the tensor AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } -// Test deletion after accessing tensor data -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteAfterDataAccess) { - // Create a tensor +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteDifferentDtypes_CPU) { std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - // Access tensor data (this should not prevent deletion) - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Delete the tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} + // Float + { + Tensor* tensor = createTestTensor( + sizes, + {}, + 
static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } -// Test double deletion (should fail on second attempt) -TEST_F(AOTITorchDeleteTensorObjectTest, DoubleDeletion) { - // Create a tensor - std::vector sizes = {2, 3}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); + // BFloat16 + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } - // First deletion should succeed - AOTITorchError error1 = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error1, Error::Ok); + // Long + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } - // Second deletion should fail (tensor no longer tracked) - AOTITorchError error2 = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error2, Error::InvalidArgument); + // Bool + { + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); + } } -// Test deletion of tensors on both CUDA and CPU devices -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMixedDeviceTensors) { - // Create CUDA tensor - std::vector sizes = {2, 3}; - Tensor* cuda_tensor = create_test_tensor(sizes, {}, 6, 1, 0); - ASSERT_NE(cuda_tensor, nullptr); - - // Create CPU tensor - Tensor* cpu_tensor = create_test_tensor(sizes, {}, 6, 0, 0); - ASSERT_NE(cpu_tensor, nullptr); +// ============================================================================ +// CUDA Tests +// ============================================================================ - // Delete both tensors - AOTITorchError cuda_error = aoti_torch_delete_tensor_object(cuda_tensor); - EXPECT_EQ(cuda_error, Error::Ok); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCudaTensorBasic) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - AOTITorchError cpu_error = aoti_torch_delete_tensor_object(cpu_tensor); - EXPECT_EQ(cpu_error, Error::Ok); -} + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(tensor, nullptr); -// Test memory consistency after deletion -TEST_F(AOTITorchDeleteTensorObjectTest, MemoryConsistencyAfterDeletion) { - // Create multiple tensors - std::vector tensors; - const int num_tensors = 10; + EXPECT_EQ(tensor->dim(), 2); + EXPECT_TRUE(tensor->is_cuda()); - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {i + 1, i + 2}; - Tensor* tensor = create_test_tensor(sizes); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} - // Delete every other tensor - for (int i = 0; i < num_tensors; i += 2) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); - EXPECT_EQ(error, Error::Ok); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not 
available"; } - // Delete remaining tensors - for (int i = 1; i < num_tensors; i += 2) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); - EXPECT_EQ(error, Error::Ok); - } -} - -// Test stress deletion with many small tensors -TEST_F(AOTITorchDeleteTensorObjectTest, StressDeletionManySmallTensors) { - const int num_tensors = 100; std::vector tensors; - // Create many small tensors - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; // Minimal size - Tensor* tensor = create_test_tensor(sizes); - if (tensor != nullptr) { - tensors.push_back(tensor); - } + for (int i = 1; i <= 5; i++) { + std::vector sizes = {i, i + 1}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); } - // Delete all created tensors for (Tensor* tensor : tensors) { AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); } } -// Test CUDA synchronization during deletion -TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { - // Create a larger CUDA tensor to ensure memory allocation +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {100, 100}; - Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); - // Delete the tensor (should handle synchronization internally) AOTITorchError error = aoti_torch_delete_tensor_object(tensor); EXPECT_EQ(error, Error::Ok); @@ -330,125 +323,63 @@ TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { EXPECT_EQ(cuda_error, cudaSuccess); } -// Test specific deletion of bfloat16 tensors -TEST_F(AOTITorchDeleteTensorObjectTest, DeleteBFloat16Tensor) { - // Test 1D bfloat16 tensor deletion - std::vector sizes_1d = {10}; - Tensor* tensor_bf16_1d = create_test_tensor( - sizes_1d, - {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_1d, nullptr); - - // Verify it's bfloat16 before deletion - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_1d, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << actual_dtype; - - // Verify element size (bfloat16 should be 2 bytes per element) - EXPECT_EQ(tensor_bf16_1d->element_size(), 2); - - // Delete the bfloat16 tensor - AOTITorchError error = aoti_torch_delete_tensor_object(tensor_bf16_1d); - EXPECT_EQ(error, Error::Ok); - - // Test 2D bfloat16 tensor deletion with custom strides - std::vector sizes_2d = {4, 6}; - std::vector strides_2d = {6, 1}; // Row-major strides - Tensor* tensor_bf16_2d = create_test_tensor( - sizes_2d, - strides_2d, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_2d, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor_bf16_2d->dim(), 2); - EXPECT_EQ(tensor_bf16_2d->size(0), 4); - EXPECT_EQ(tensor_bf16_2d->size(1), 6); - EXPECT_EQ(tensor_bf16_2d->element_size(), 2); - - // Verify it's bfloat16 - int32_t dtype_2d; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_2d, &dtype_2d), Error::Ok); - EXPECT_EQ(dtype_2d, 
static_cast(SupportedDTypes::BFLOAT16)); +TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMixedDeviceTensors) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } - // Delete the 2D bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_2d); - EXPECT_EQ(error, Error::Ok); + std::vector sizes = {2, 3}; - // Test 3D bfloat16 tensor deletion - std::vector sizes_3d = {2, 3, 4}; - Tensor* tensor_bf16_3d = create_test_tensor( - sizes_3d, + // Create CUDA tensor + Tensor* cuda_tensor = createTestTensor( + sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor_bf16_3d, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor_bf16_3d->dim(), 3); - EXPECT_EQ(tensor_bf16_3d->size(0), 2); - EXPECT_EQ(tensor_bf16_3d->size(1), 3); - EXPECT_EQ(tensor_bf16_3d->size(2), 4); - EXPECT_EQ(tensor_bf16_3d->element_size(), 2); - - // Verify memory size (2 * 3 * 4 * 2 bytes = 48 bytes) - size_t expected_memory = 2 * 3 * 4 * 2; - size_t actual_memory = - tensor_bf16_3d->numel() * tensor_bf16_3d->element_size(); - EXPECT_EQ(actual_memory, expected_memory); - - // Delete the 3D bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_3d); - EXPECT_EQ(error, Error::Ok); + ASSERT_NE(cuda_tensor, nullptr); + EXPECT_TRUE(cuda_tensor->is_cuda()); - // Test bfloat16 scalar tensor (0D) deletion - std::vector scalar_sizes = {}; - Tensor* tensor_bf16_scalar = create_test_tensor( - scalar_sizes, + // Create CPU tensor + Tensor* cpu_tensor = createTestTensor( + sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor_bf16_scalar, nullptr); + ASSERT_NE(cpu_tensor, nullptr); + EXPECT_TRUE(cpu_tensor->is_cpu()); - // Verify scalar tensor properties - EXPECT_EQ(tensor_bf16_scalar->dim(), 0); - EXPECT_EQ(tensor_bf16_scalar->numel(), 1); - EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + // Delete both tensors + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); +} - // Delete the scalar bfloat16 tensor - error = aoti_torch_delete_tensor_object(tensor_bf16_scalar); - EXPECT_EQ(error, Error::Ok); +// ============================================================================ +// Stress Tests +// ============================================================================ - // Test zero-element bfloat16 tensor deletion - std::vector zero_sizes = {0, 5}; - Tensor* tensor_bf16_zero = create_test_tensor( - zero_sizes, - {}, - static_cast(SupportedDTypes::BFLOAT16), - 1, // CUDA device - 0); - ASSERT_NE(tensor_bf16_zero, nullptr); +TEST_F( + AOTITorchDeleteTensorObjectSlimTest, + StressDeletionManySmallTensors_CPU) { + const int num_tensors = 100; + std::vector tensors; - // Verify zero-element tensor properties - EXPECT_EQ(tensor_bf16_zero->dim(), 2); - EXPECT_EQ(tensor_bf16_zero->size(0), 0); - EXPECT_EQ(tensor_bf16_zero->size(1), 5); - EXPECT_EQ(tensor_bf16_zero->numel(), 0); - EXPECT_EQ(tensor_bf16_zero->element_size(), 2); + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; + Tensor* tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + if (tensor != nullptr) { + tensors.push_back(tensor); + } + } - // Delete the zero-element bfloat16 tensor - 
error = aoti_torch_delete_tensor_object(tensor_bf16_zero); - EXPECT_EQ(error, Error::Ok); + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } } - -// Test deletion of mixed dtype tensors (float32 and bfloat16) diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp deleted file mode 100644 index e88ebb3185c..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object_slim.cpp +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchDeleteTensorObjectSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // SlimTensor uses automatic reference counting - no manual cleanup needed - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCpuTensorBasic) { - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteNullTensor) { - AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CPU) { - std::vector tensors; - - for (int i = 1; i <= 5; i++) { - std::vector sizes = {i, i + 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteZeroSizedTensor_CPU) { - std::vector sizes = {0, 5}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); - EXPECT_EQ(tensor->numel(), 0); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteScalarTensor_CPU) { - std::vector sizes = {}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CPU) { - std::vector sizes = {10, 20, 30}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->numel(), 6000); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteTensorWithCustomStrides_CPU) { - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - Tensor* tensor = createTestTensor( - sizes, - strides, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->stride(0), 1); - EXPECT_EQ(tensor->stride(1), 3); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteDifferentDtypes_CPU) { - std::vector sizes = {2, 3}; - - // Float - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - 
static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // BFloat16 - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::BFloat16), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // Long - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } - - // Bool - { - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); - } -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteCudaTensorBasic) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_TRUE(tensor->is_cuda()); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMultipleTensors_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector tensors; - - for (int i = 1; i <= 5; i++) { - std::vector sizes = {i, i + 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - tensors.push_back(tensor); - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteLargeTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {100, 100}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - - // Verify CUDA state is still good - cudaError_t cuda_error = cudaGetLastError(); - EXPECT_EQ(cuda_error, cudaSuccess); -} - -TEST_F(AOTITorchDeleteTensorObjectSlimTest, DeleteMixedDeviceTensors) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - - // Create CUDA tensor - Tensor* cuda_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(cuda_tensor, nullptr); - EXPECT_TRUE(cuda_tensor->is_cuda()); - - // Create CPU tensor - Tensor* cpu_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(cpu_tensor, nullptr); - EXPECT_TRUE(cpu_tensor->is_cpu()); - - // Delete both tensors - 
EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); -} - -// ============================================================================ -// Stress Tests -// ============================================================================ - -TEST_F( - AOTITorchDeleteTensorObjectSlimTest, - StressDeletionManySmallTensors_CPU) { - const int num_tensors = 100; - std::vector tensors; - - for (int i = 0; i < num_tensors; i++) { - std::vector sizes = {1, 1}; - Tensor* tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - if (tensor != nullptr) { - tensors.push_back(tensor); - } - } - - for (Tensor* tensor : tensors) { - AOTITorchError error = aoti_torch_delete_tensor_object(tensor); - EXPECT_EQ(error, Error::Ok); - } -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp index 799a8d1221b..d563eea98bc 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -7,661 +7,461 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::backends::aoti; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_empty_strided tests -class AOTITorchEmptyStridedTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +// Helper to check if CUDA is available +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; } - - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; } + return strides; +} - // Helper to create test tensors - Tensor* create_tracked_tensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), - int32_t device_index = 0) { - Tensor* tensor; +} // namespace - const int64_t* strides_ptr = strides.empty() ? 
nullptr : strides.data(); +// Test fixture for SlimTensor-based aoti_torch_empty_strided tests +class AOTITorchEmptyStridedSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + } - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides_ptr, - dtype, - device_type, - device_index, - &tensor); + void TearDown() override { + // Tensors are cleaned up via their destructors + for (Tensor* t : tensors_) { + delete t; + } + tensors_.clear(); + } - return (error == Error::Ok) ? tensor : nullptr; + // Track tensors for cleanup + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); + } } + + private: + std::vector tensors_; }; -// Test aoti_torch_empty_strided basic functionality -TEST_F(AOTITorchEmptyStridedTest, BasicFunctionality) { +// ============================================================================ +// Common test body - parameterized by device type +// ============================================================================ + +void runBasicEmptyStridedTest(int32_t device_type, int32_t device_index) { // Test 1D tensor std::vector sizes_1d = {5}; - Tensor* tensor_1d; + std::vector strides_1d = calculateContiguousStrides(sizes_1d); + + Tensor* tensor_1d = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes_1d.size(), sizes_1d.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + strides_1d.data(), + static_cast(slim_c10::ScalarType::Float), // dtype = 6 + device_type, + device_index, &tensor_1d); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_1d, nullptr); - - // CRITICAL: Verify the tensor is actually float32 - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_1d, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::FLOAT32)) - << "Expected float32 dtype (" - << static_cast(SupportedDTypes::FLOAT32) << "), got " - << actual_dtype; - - // Verify element size (float32 should be 4 bytes per element) - size_t element_size = tensor_1d->element_size(); - EXPECT_EQ(element_size, 4) - << "Expected float32 element size to be 4 bytes, got " << element_size; - - // Verify total number of elements and memory usage - int64_t expected_numel = 5; // 5 elements - EXPECT_EQ(tensor_1d->numel(), expected_numel) - << "Expected " << expected_numel << " elements, got " - << tensor_1d->numel(); - - // Verify total memory size (numel * element_size) - size_t expected_memory_size = expected_numel * 4; // 5 * 4 = 20 bytes - size_t actual_memory_size = tensor_1d->numel() * tensor_1d->element_size(); - EXPECT_EQ(actual_memory_size, expected_memory_size) - << "Expected " << expected_memory_size << " bytes, got " - << actual_memory_size; + ASSERT_NE(tensor_1d, nullptr); // Check tensor properties EXPECT_EQ(tensor_1d->dim(), 1); EXPECT_EQ(tensor_1d->size(0), 5); + EXPECT_EQ(tensor_1d->numel(), 5); + EXPECT_EQ( + static_cast(tensor_1d->dtype()), + static_cast(slim_c10::ScalarType::Float)); + EXPECT_NE(tensor_1d->data_ptr(), nullptr); - // Test 2D tensor with explicit strides - std::vector sizes_2d = {3, 4}; - std::vector strides_2d = {4, 1}; - Tensor* tensor_2d; - error = aoti_torch_empty_strided( - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_2d); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_2d, nullptr); - - // Verify 2D tensor is 
also float32 - int32_t dtype_2d; - EXPECT_EQ(aoti_torch_get_dtype(tensor_2d, &dtype_2d), Error::Ok); - EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::FLOAT32)) - << "Expected float32 dtype (" - << static_cast(SupportedDTypes::FLOAT32) << "), got " - << dtype_2d; - - // Verify element size for 2D tensor - EXPECT_EQ(tensor_2d->element_size(), 4); - - // Check tensor properties - EXPECT_EQ(tensor_2d->dim(), 2); - EXPECT_EQ(tensor_2d->size(0), 3); - EXPECT_EQ(tensor_2d->size(1), 4); - - // Verify memory size for 2D tensor - int64_t expected_numel_2d = 3 * 4; // 12 elements - size_t expected_memory_2d = expected_numel_2d * 4; // 12 * 4 = 48 bytes - EXPECT_EQ(tensor_2d->numel() * tensor_2d->element_size(), expected_memory_2d); + // Cleanup + delete tensor_1d; } -// Test aoti_torch_empty_strided with CPU device -TEST_F(AOTITorchEmptyStridedTest, CPUDevice) { - std::vector sizes = {2, 3}; - Tensor* tensor; +void runMultiDimensionalEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + // Test 3D tensor + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CPU), - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); // Check tensor properties - EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->dim(), 3); EXPECT_EQ(tensor->size(0), 2); EXPECT_EQ(tensor->size(1), 3); -} + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->numel(), 24); -// Test aoti_torch_empty_strided with invalid dtype -TEST_F(AOTITorchEmptyStridedTest, InvalidDtype) { - std::vector sizes = {2, 3}; - Tensor* tensor; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 999, // invalid dtype - 1, // CUDA device - 0, // device index - &tensor); + // Check strides + EXPECT_EQ(tensor->stride(0), 12); + EXPECT_EQ(tensor->stride(1), 4); + EXPECT_EQ(tensor->stride(2), 1); - EXPECT_EQ(error, Error::InvalidArgument); + delete tensor; } -// Test aoti_torch_empty_strided with unsupported device -TEST_F(AOTITorchEmptyStridedTest, UnsupportedDevice) { - std::vector sizes = {2, 3}; - Tensor* tensor; +void runScalarTensorEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 2, // unsupported device type - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); - EXPECT_EQ(error, Error::NotImplemented); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_NE(tensor->data_ptr(), nullptr); + + delete tensor; } -// Test aoti_torch_empty_strided with zero-sized tensor -TEST_F(AOTITorchEmptyStridedTest, ZeroSized) { +void runZeroSizedTensorEmptyStridedTest( + int32_t device_type, + int32_t device_index) { std::vector sizes = {0, 5}; - Tensor* tensor; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 
6, // float32 - 1, // CUDA device - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Check tensor properties EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 0); EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->numel(), 0); + + delete tensor; } -// Test aoti_torch_empty_strided scalar tensor (0D) -TEST_F(AOTITorchEmptyStridedTest, Scalar) { - std::vector sizes = {}; - Tensor* tensor; +void runCustomStridesEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + // Create a transposed (column-major) tensor + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major + + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 0); -} + ASSERT_NE(tensor, nullptr); -// Test aoti_torch_empty_strided with large tensor -TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { - std::vector sizes = {100, 200, 50}; - Tensor* tensor; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index - &tensor); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + // Non-contiguous due to custom strides + EXPECT_FALSE(tensor->is_contiguous()); - // Check tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 100); - EXPECT_EQ(tensor->size(1), 200); - EXPECT_EQ(tensor->size(2), 50); + delete tensor; } -// Test aoti_torch_empty_strided with bfloat16 dtype -TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { - // Test creating bfloat16 tensor on CUDA - std::vector sizes = {2, 3, 4}; - Tensor* tensor_bf16; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, // Let function compute strides - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_bf16); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_bf16, nullptr); - - // CRITICAL: Verify the tensor is actually bfloat16 - int32_t actual_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16, &actual_dtype), Error::Ok); - EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << actual_dtype; - - // Verify element size (bfloat16 should be 2 bytes per element) - size_t element_size = tensor_bf16->element_size(); - EXPECT_EQ(element_size, 2) - << "Expected bfloat16 element size to be 2 bytes, got " << element_size; - - // Verify total number of elements and memory usage - int64_t expected_numel = 2 * 3 * 4; // 24 elements - EXPECT_EQ(tensor_bf16->numel(), expected_numel) - << "Expected " << expected_numel << " elements, got " - << tensor_bf16->numel(); - - // Verify total memory size (numel * element_size) - size_t expected_memory_size = expected_numel * 2; // 24 * 2 = 48 bytes - size_t actual_memory_size = - tensor_bf16->numel() * tensor_bf16->element_size(); - 
EXPECT_EQ(actual_memory_size, expected_memory_size) - << "Expected " << expected_memory_size << " bytes, got " - << actual_memory_size; +void runDifferentDtypesEmptyStridedTest( + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - // Check tensor properties - EXPECT_EQ(tensor_bf16->dim(), 3); - EXPECT_EQ(tensor_bf16->size(0), 2); - EXPECT_EQ(tensor_bf16->size(1), 3); - EXPECT_EQ(tensor_bf16->size(2), 4); - - // Verify we can get tensor metadata - int64_t* sizes_ptr; - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_sizes(tensor_bf16, &sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor_bf16, &strides_ptr), Error::Ok); - - // Check sizes match - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 3); - EXPECT_EQ(sizes_ptr[2], 4); - - // Check that strides are computed correctly (row-major order) - EXPECT_EQ(strides_ptr[0], 12); // 3 * 4 - EXPECT_EQ(strides_ptr[1], 4); // 4 - EXPECT_EQ(strides_ptr[2], 1); // 1 - - // Test bfloat16 tensor with custom strides - std::vector sizes_2d = {3, 2}; - std::vector strides_2d = {2, 1}; // Row-major strides - Tensor* tensor_bf16_custom; - error = aoti_torch_empty_strided( - sizes_2d.size(), - sizes_2d.data(), - strides_2d.data(), - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor_bf16_custom); + // Test Float32 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Float); + EXPECT_EQ(tensor->itemsize(), 4); + delete tensor; + } - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_bf16_custom, nullptr); + // Test BFloat16 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::BFloat16), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::BFloat16); + EXPECT_EQ(tensor->itemsize(), 2); + delete tensor; + } - // Verify custom stride tensor is also bfloat16 - int32_t custom_dtype; - EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_custom, &custom_dtype), Error::Ok); - EXPECT_EQ(custom_dtype, static_cast(SupportedDTypes::BFLOAT16)) - << "Expected bfloat16 dtype (" - << static_cast(SupportedDTypes::BFLOAT16) << "), got " - << custom_dtype; + // Test Int64 + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Long), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Long); + EXPECT_EQ(tensor->itemsize(), 8); + delete tensor; + } - // Verify element size for custom stride tensor - EXPECT_EQ(tensor_bf16_custom->element_size(), 2); + // Test Bool + { + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(slim_c10::ScalarType::Bool), + device_type, + device_index, + &tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Bool); + EXPECT_EQ(tensor->itemsize(), 
1);
+    delete tensor;
+  }
+}
 
-  // Check tensor properties
-  EXPECT_EQ(tensor_bf16_custom->dim(), 2);
-  EXPECT_EQ(tensor_bf16_custom->size(0), 3);
-  EXPECT_EQ(tensor_bf16_custom->size(1), 2);
+// ============================================================================
+// CPU Tests
+// ============================================================================
 
-  // Verify memory size for custom stride tensor
-  int64_t custom_expected_numel = 3 * 2; // 6 elements
-  size_t custom_expected_memory = custom_expected_numel * 2; // 6 * 2 = 12 bytes
-  EXPECT_EQ(
-      tensor_bf16_custom->numel() * tensor_bf16_custom->element_size(),
-      custom_expected_memory);
+TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CPU) {
+  runBasicEmptyStridedTest(static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Check custom strides
-  int64_t* custom_strides_ptr;
-  EXPECT_EQ(
-      aoti_torch_get_strides(tensor_bf16_custom, &custom_strides_ptr),
-      Error::Ok);
-  EXPECT_EQ(custom_strides_ptr[0], 2);
-  EXPECT_EQ(custom_strides_ptr[1], 1);
-
-  // Test bfloat16 scalar tensor (0D)
-  std::vector<int64_t> scalar_sizes = {};
-  Tensor* tensor_bf16_scalar;
-  error = aoti_torch_empty_strided(
-      scalar_sizes.size(),
-      scalar_sizes.data(),
-      nullptr,
-      static_cast<int32_t>(SupportedDTypes::BFLOAT16),
-      static_cast<int32_t>(SupportedDevices::CUDA),
-      0, // device index
-      &tensor_bf16_scalar);
+TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CPU) {
+  runMultiDimensionalEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(tensor_bf16_scalar, nullptr);
-  EXPECT_EQ(tensor_bf16_scalar->dim(), 0);
-
-  // Verify scalar tensor is also bfloat16
-  int32_t scalar_dtype;
-  EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_scalar, &scalar_dtype), Error::Ok);
-  EXPECT_EQ(scalar_dtype, static_cast<int32_t>(SupportedDTypes::BFLOAT16))
-      << "Expected bfloat16 dtype ("
-      << static_cast<int32_t>(SupportedDTypes::BFLOAT16) << "), got "
-      << scalar_dtype;
-
-  // Verify scalar tensor properties
-  EXPECT_EQ(tensor_bf16_scalar->element_size(), 2);
-  EXPECT_EQ(tensor_bf16_scalar->numel(), 1); // Scalar tensor has 1 element
-  EXPECT_EQ(
-      tensor_bf16_scalar->numel() * tensor_bf16_scalar->element_size(),
-      2); // 1 * 2 = 2 bytes
+TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CPU) {
+  runScalarTensorEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
 }
 
-// Test custom strides functionality
-TEST_F(AOTITorchEmptyStridedTest, CustomStrides) {
-  // Create tensor with valid custom strides (contiguous layout)
-  std::vector<int64_t> sizes = {2, 3};
-  std::vector<int64_t> strides = {3, 1}; // Standard row-major strides
+TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CPU) {
+  runZeroSizedTensorEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  Tensor* tensor = create_tracked_tensor(sizes, strides);
-  EXPECT_NE(tensor, nullptr);
+TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CPU) {
+  runCustomStridesEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Verify the tensor was created correctly
-  EXPECT_EQ(tensor->dim(), 2);
-  EXPECT_EQ(tensor->size(0), 2);
-  EXPECT_EQ(tensor->size(1), 3);
+TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CPU) {
+  runDifferentDtypesEmptyStridedTest(
+      static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
 
-  // Check strides through AOTI interface
-  int64_t* strides_ptr;
-  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
-  EXPECT_EQ(strides_ptr[0], 3);
-  EXPECT_EQ(strides_ptr[1], 1);
+// 
============================================================================ +// CUDA Tests +// ============================================================================ - // Test another valid stride pattern - transpose-like - std::vector sizes_2 = {3, 2}; - std::vector strides_2 = {1, 3}; // Column-major strides +TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CUDA), 0); +} - Tensor* tensor_2 = create_tracked_tensor(sizes_2, strides_2); - EXPECT_NE(tensor_2, nullptr); +TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runMultiDimensionalEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Verify the tensor properties - EXPECT_EQ(tensor_2->dim(), 2); - EXPECT_EQ(tensor_2->size(0), 3); - EXPECT_EQ(tensor_2->size(1), 2); +TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runScalarTensorEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Check strides - int64_t* strides_ptr_2; - EXPECT_EQ(aoti_torch_get_strides(tensor_2, &strides_ptr_2), Error::Ok); - EXPECT_EQ(strides_ptr_2[0], 1); - EXPECT_EQ(strides_ptr_2[1], 3); +TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runZeroSizedTensorEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test edge case: zero-element tensor with non-zero dimensions -TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { - std::vector sizes = {2, 0, 3}; // Total elements = 0 - Tensor* tensor = create_tracked_tensor(sizes); - EXPECT_NE(tensor, nullptr); +TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runCustomStridesEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); +} - // Verify the tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 0); - EXPECT_EQ(tensor->size(2), 3); - - // Should be able to get metadata - int64_t* sizes_ptr; - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - - EXPECT_EQ(sizes_ptr[0], 2); - EXPECT_EQ(sizes_ptr[1], 0); - EXPECT_EQ(sizes_ptr[2], 3); +TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + runDifferentDtypesEmptyStridedTest( + static_cast(slim_c10::DeviceType::CUDA), 0); } -// Test different data types (currently we support bf16, fp32 and int32) -TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { +// ============================================================================ +// Verify Device Properties +// ============================================================================ + +TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCPUDevice) { std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - // Test float32 (dtype 6) - one of the supported types - Tensor* tensor_float32; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - nullptr, - 6, // float32 - 1, // CUDA device - 0, // device index - &tensor_float32); - - EXPECT_EQ(error, 
Error::Ok); - EXPECT_NE(tensor_float32, nullptr); - - // Test int32 (dtype 3) - one of the supported types - Tensor* tensor_int32; - error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 3, // int32 - unsupported - 1, // CUDA device - 0, // device index - &tensor_int32); + strides.data(), + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor_int32, nullptr); + ASSERT_NE(tensor, nullptr); - // Test another unsupported data type - Tensor* tensor_float64; - error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - nullptr, - 7, // float64 - unsupported - 1, // CUDA device - 0, // device index - &tensor_float64); + EXPECT_TRUE(tensor->is_cpu()); + EXPECT_FALSE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + delete tensor; } -// Test multi-dimensional tensors with various shapes -TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { - // Test 3D tensor - std::vector sizes_3d = {2, 3, 4}; - Tensor* tensor_3d = create_tracked_tensor(sizes_3d); - EXPECT_NE(tensor_3d, nullptr); - EXPECT_EQ(tensor_3d->dim(), 3); - EXPECT_EQ(tensor_3d->size(0), 2); - EXPECT_EQ(tensor_3d->size(1), 3); - EXPECT_EQ(tensor_3d->size(2), 4); - - // Test 4D tensor - std::vector sizes_4d = {2, 3, 4, 5}; - Tensor* tensor_4d = create_tracked_tensor(sizes_4d); - EXPECT_NE(tensor_4d, nullptr); - EXPECT_EQ(tensor_4d->dim(), 4); - EXPECT_EQ(tensor_4d->size(0), 2); - EXPECT_EQ(tensor_4d->size(1), 3); - EXPECT_EQ(tensor_4d->size(2), 4); - EXPECT_EQ(tensor_4d->size(3), 5); - - // Test 5D tensor - std::vector sizes_5d = {1, 2, 3, 4, 5}; - Tensor* tensor_5d = create_tracked_tensor(sizes_5d); - EXPECT_NE(tensor_5d, nullptr); - EXPECT_EQ(tensor_5d->dim(), 5); - EXPECT_EQ(tensor_5d->size(0), 1); - EXPECT_EQ(tensor_5d->size(1), 2); - EXPECT_EQ(tensor_5d->size(2), 3); - EXPECT_EQ(tensor_5d->size(3), 4); - EXPECT_EQ(tensor_5d->size(4), 5); -} +TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCUDADevice) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } -// Test incontiguous tensor creation - transpose-like layout -TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) { - // Create a tensor with transpose-like strides (column-major) - // For a 3x4 tensor in column-major order, strides should be [1, 3] - // This means each row step is 1, and each column step is 3 - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major (incontiguous) + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; + Tensor* tensor = nullptr; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, &tensor); EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); + ASSERT_NE(tensor, nullptr); - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); + EXPECT_FALSE(tensor->is_cpu()); + EXPECT_TRUE(tensor->is_cuda()); + EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - // Verify the strides are what we specified - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - 
EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0 - EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1 - - // Verify that memory was allocated correctly for incontiguous layout - // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - - // 1) + 1 = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 1 * 2 + 3 * 3 + 1 = 2 + 9 + 1 = 12 - // elements Total bytes = 12 * 4 = 48 bytes (for float32) - EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for logical shape - - // The tensor should be accessible and writable - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Verify we can use CUDA to write to the memory - std::vector test_data(12, 1.0f); - cudaError_t cuda_err = cudaMemcpy( - data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); + delete tensor; } -// Test incontiguous tensor creation - expanded/broadcasted stride pattern -TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) { - // Create a tensor with expanded strides (simulating broadcasting) - // A 2x3x4 tensor where the first dimension has stride 0 (expanded) - // This creates a tensor where the first dimension is "broadcasted" - std::vector sizes = {2, 3, 4}; - std::vector strides = {0, 4, 1}; // First dimension has stride 0 +// ============================================================================ +// Error Cases +// ============================================================================ + +TEST_F(AOTITorchEmptyStridedSlimTest, NullReturnPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); - Tensor* tensor; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA), - 0, // device index - &tensor); + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + nullptr); // null return pointer - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(tensor, nullptr); - - // Verify tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - - // Verify the strides are what we specified - int64_t* strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); - EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride - EXPECT_EQ(strides_ptr[1], 4); - EXPECT_EQ(strides_ptr[2], 1); - - // Verify that memory was allocated correctly for this incontiguous layout - // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - - // 1) + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 - - // 1) + 1 = 0 + 8 + 3 + 1 = 12 elements Note: numel() returns logical number - // of elements (2*3*4=24), not storage size - EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24 - - // The tensor should be accessible and writable - void* data_ptr = tensor->mutable_data_ptr(); - EXPECT_NE(data_ptr, nullptr); - - // Verify we can use CUDA to write to the allocated memory - // We only need to allocate 12 elements (storage size), not 24 - std::vector test_data(12, 2.0f); - cudaError_t cuda_err = cudaMemcpy( - data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); - EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(error, Error::InvalidArgument); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp 
b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp deleted file mode 100644 index d563eea98bc..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided_slim.cpp +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -// Helper to check if CUDA is available -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -// Helper to calculate contiguous strides from sizes -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -// Test fixture for SlimTensor-based aoti_torch_empty_strided tests -class AOTITorchEmptyStridedSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // Tensors are cleaned up via their destructors - for (Tensor* t : tensors_) { - delete t; - } - tensors_.clear(); - } - - // Track tensors for cleanup - void trackTensor(Tensor* t) { - if (t != nullptr) { - tensors_.push_back(t); - } - } - - private: - std::vector tensors_; -}; - -// ============================================================================ -// Common test body - parameterized by device type -// ============================================================================ - -void runBasicEmptyStridedTest(int32_t device_type, int32_t device_index) { - // Test 1D tensor - std::vector sizes_1d = {5}; - std::vector strides_1d = calculateContiguousStrides(sizes_1d); - - Tensor* tensor_1d = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes_1d.size(), - sizes_1d.data(), - strides_1d.data(), - static_cast(slim_c10::ScalarType::Float), // dtype = 6 - device_type, - device_index, - &tensor_1d); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor_1d, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor_1d->dim(), 1); - EXPECT_EQ(tensor_1d->size(0), 5); - EXPECT_EQ(tensor_1d->numel(), 5); - EXPECT_EQ( - static_cast(tensor_1d->dtype()), - static_cast(slim_c10::ScalarType::Float)); - EXPECT_NE(tensor_1d->data_ptr(), nullptr); - - // Cleanup - delete tensor_1d; -} - -void runMultiDimensionalEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - // Test 3D tensor - std::vector sizes = {2, 3, 4}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - // Check tensor properties - EXPECT_EQ(tensor->dim(), 3); - EXPECT_EQ(tensor->size(0), 2); - EXPECT_EQ(tensor->size(1), 3); - EXPECT_EQ(tensor->size(2), 4); - EXPECT_EQ(tensor->numel(), 24); - - // Check 
strides - EXPECT_EQ(tensor->stride(0), 12); - EXPECT_EQ(tensor->stride(1), 4); - EXPECT_EQ(tensor->stride(2), 1); - - delete tensor; -} - -void runScalarTensorEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 0); - EXPECT_EQ(tensor->numel(), 1); - EXPECT_NE(tensor->data_ptr(), nullptr); - - delete tensor; -} - -void runZeroSizedTensorEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {0, 5}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 0); - EXPECT_EQ(tensor->size(1), 5); - EXPECT_EQ(tensor->numel(), 0); - - delete tensor; -} - -void runCustomStridesEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - // Create a transposed (column-major) tensor - std::vector sizes = {3, 4}; - std::vector strides = {1, 3}; // Column-major - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 3); - EXPECT_EQ(tensor->size(1), 4); - EXPECT_EQ(tensor->stride(0), 1); - EXPECT_EQ(tensor->stride(1), 3); - - // Non-contiguous due to custom strides - EXPECT_FALSE(tensor->is_contiguous()); - - delete tensor; -} - -void runDifferentDtypesEmptyStridedTest( - int32_t device_type, - int32_t device_index) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - // Test Float32 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Float); - EXPECT_EQ(tensor->itemsize(), 4); - delete tensor; - } - - // Test BFloat16 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::BFloat16), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::BFloat16); - EXPECT_EQ(tensor->itemsize(), 2); - delete tensor; - } - - // Test Int64 - { - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Long), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Long); - EXPECT_EQ(tensor->itemsize(), 8); - delete tensor; - } - - // Test Bool - { - 
Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Bool), - device_type, - device_index, - &tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - EXPECT_EQ(tensor->dtype(), slim_c10::ScalarType::Bool); - EXPECT_EQ(tensor->itemsize(), 1); - delete tensor; - } -} - -// ============================================================================ -// CPU Tests -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CPU) { - runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CPU) { - runMultiDimensionalEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CPU) { - runScalarTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CPU) { - runZeroSizedTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CPU) { - runCustomStridesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CPU) { - runDifferentDtypesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CPU), 0); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, BasicFunctionality_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runBasicEmptyStridedTest(static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, MultiDimensional_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runMultiDimensionalEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ScalarTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runScalarTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, ZeroSizedTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runZeroSizedTensorEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, CustomStrides_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runCustomStridesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -TEST_F(AOTITorchEmptyStridedSlimTest, DifferentDtypes_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - runDifferentDtypesEmptyStridedTest( - static_cast(slim_c10::DeviceType::CUDA), 0); -} - -// ============================================================================ -// Verify Device Properties -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCPUDevice) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - &tensor); - - 
EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_TRUE(tensor->is_cpu()); - EXPECT_FALSE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU); - - delete tensor; -} - -TEST_F(AOTITorchEmptyStridedSlimTest, VerifyCUDADevice) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - Tensor* tensor = nullptr; - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0, - &tensor); - - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(tensor, nullptr); - - EXPECT_FALSE(tensor->is_cpu()); - EXPECT_TRUE(tensor->is_cuda()); - EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA); - - delete tensor; -} - -// ============================================================================ -// Error Cases -// ============================================================================ - -TEST_F(AOTITorchEmptyStridedSlimTest, NullReturnPointer) { - std::vector sizes = {2, 3}; - std::vector strides = calculateContiguousStrides(sizes); - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0, - nullptr); // null return pointer - - EXPECT_EQ(error, Error::InvalidArgument); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp index 8e6bcbbfad6..dee95cbafe2 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool.cpp @@ -7,197 +7,285 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; -using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +#include +#include +#include +#include +#include -// Test fixture for aoti_torch_item_bool tests -class AOTITorchItemBoolTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +using namespace executorch::backends::cuda; +using executorch::runtime::Error; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +namespace { - // Clear any remaining tensors from previous tests - clear_all_tensors(); - } +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); +} // namespace - // Clear the global tensor storage using the provided function - clear_all_tensors(); +class AOTITorchItemBoolSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } - // Helper to create a bool tensor on CUDA with a specific value - Tensor* create_cuda_bool_tensor(bool value) { - // Create a 0D (scalar) bool tensor - std::vector sizes = 
{}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar - Tensor* tensor; + Tensor* createScalarBoolTensor( + bool value, + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector sizes = {1}; + std::vector strides = {1}; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::BOOL), - static_cast(SupportedDevices::CUDA), - 0, + static_cast(slim_c10::ScalarType::Bool), + device_type, + device_index, &tensor); if (error != Error::Ok || tensor == nullptr) { return nullptr; } - // Set the value - bool host_value = value; - cudaError_t cuda_err = cudaMemcpy( - tensor->mutable_data_ptr(), - &host_value, - sizeof(bool), - cudaMemcpyHostToDevice); - - if (cuda_err != cudaSuccess) { - aoti_torch_delete_tensor_object(tensor); - return nullptr; + if (device_type == static_cast(slim_c10::DeviceType::CPU)) { + bool* data = static_cast(tensor->data_ptr()); + *data = value; + } else { + cudaMemcpy( + tensor->data_ptr(), &value, sizeof(bool), cudaMemcpyHostToDevice); } return tensor; } - // Helper to create a bool tensor on CPU with a specific value - Tensor* create_cpu_bool_tensor(bool value) { - // Create a 0D (scalar) bool tensor - std::vector sizes = {}; // 0D tensor - std::vector strides = {}; // Empty strides for scalar - Tensor* tensor; + Tensor* createTestTensor( + const std::vector& sizes, + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), + int32_t device_index = 0) { + Tensor* tensor = nullptr; + + std::vector strides(sizes.size()); + if (!sizes.empty()) { + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), strides.data(), - static_cast(SupportedDTypes::BOOL), - static_cast(SupportedDevices::CPU), - 0, + dtype, + device_type, + device_index, &tensor); - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } - - // Set the value directly - bool* data_ptr = static_cast(tensor->mutable_data_ptr()); - *data_ptr = value; - - return tensor; + return (error == Error::Ok) ? 
tensor : nullptr; } }; -// Test extracting true value from CUDA bool tensor -TEST_F(AOTITorchItemBoolTest, CUDATensorTrueValue) { - Tensor* tensor = create_cuda_bool_tensor(true); +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CPU) { + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CPU), 0); ASSERT_NE(tensor, nullptr); bool result = false; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_TRUE(result); + EXPECT_EQ(result, true); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting false value from CUDA bool tensor -TEST_F(AOTITorchItemBoolTest, CUDATensorFalseValue) { - Tensor* tensor = create_cuda_bool_tensor(false); +TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CPU) { + Tensor* tensor = createScalarBoolTensor( + false, static_cast(slim_c10::DeviceType::CPU), 0); ASSERT_NE(tensor, nullptr); bool result = true; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_FALSE(result); + EXPECT_EQ(result, false); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +// ============================================================================ +// Error Handling Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, NullTensor) { + bool result = false; + AOTITorchError error = aoti_torch_item_bool(nullptr, &result); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchItemBoolSlimTest, NullReturnValue) { + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CPU), 0); + ASSERT_NE(tensor, nullptr); + + AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor) { + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + EXPECT_GT(tensor->numel(), 1); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float) { + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); +} + +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Long) { + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(tensor, nullptr); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting true value from CPU bool tensor 
-TEST_F(AOTITorchItemBoolTest, CPUTensorTrueValue) { - Tensor* tensor = create_cpu_bool_tensor(true); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + Tensor* tensor = createScalarBoolTensor( + true, static_cast(slim_c10::DeviceType::CUDA), 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); bool result = false; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_TRUE(result); + EXPECT_EQ(result, true); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test extracting false value from CPU bool tensor -TEST_F(AOTITorchItemBoolTest, CPUTensorFalseValue) { - Tensor* tensor = create_cpu_bool_tensor(false); +TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + Tensor* tensor = createScalarBoolTensor( + false, static_cast(slim_c10::DeviceType::CUDA), 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); bool result = true; AOTITorchError error = aoti_torch_item_bool(tensor, &result); EXPECT_EQ(error, Error::Ok); - EXPECT_FALSE(result); -} + EXPECT_EQ(result, false); -// Test with null tensor pointer -TEST_F(AOTITorchItemBoolTest, NullTensorPointer) { - bool result; - AOTITorchError error = aoti_torch_item_bool(nullptr, &result); - EXPECT_EQ(error, Error::InvalidArgument); + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test with null result pointer -TEST_F(AOTITorchItemBoolTest, NullResultPointer) { - Tensor* tensor = create_cuda_bool_tensor(true); +TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Bool), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); + EXPECT_TRUE(tensor->is_cuda()); + + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); - AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } -// Test with non-bool dtype (should fail) -TEST_F(AOTITorchItemBoolTest, NonBoolDtype) { - // Create a float tensor - std::vector sizes = {}; - std::vector strides = {}; - Tensor* tensor; - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(SupportedDTypes::FLOAT32), // Not bool - static_cast(SupportedDevices::CUDA), - 0, - &tensor); - - ASSERT_EQ(error, Error::Ok); +TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {1}; + Tensor* tensor = createTestTensor( + sizes, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(tensor, nullptr); - bool result; - error = aoti_torch_item_bool(tensor, &result); + bool result = false; + AOTITorchError error = aoti_torch_item_bool(tensor, &result); + EXPECT_EQ(error, Error::InvalidArgument); + + EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); } diff --git 
a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp deleted file mode 100644 index dee95cbafe2..00000000000 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_item_bool_slim.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -} // namespace - -class AOTITorchItemBoolSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - Tensor* createScalarBoolTensor( - bool value, - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector sizes = {1}; - std::vector strides = {1}; - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - static_cast(slim_c10::ScalarType::Bool), - device_type, - device_index, - &tensor); - - if (error != Error::Ok || tensor == nullptr) { - return nullptr; - } - - if (device_type == static_cast(slim_c10::DeviceType::CPU)) { - bool* data = static_cast(tensor->data_ptr()); - *data = value; - } else { - cudaMemcpy( - tensor->data_ptr(), &value, sizeof(bool), cudaMemcpyHostToDevice); - } - - return tensor; - } - - Tensor* createTestTensor( - const std::vector& sizes, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector strides(sizes.size()); - if (!sizes.empty()) { - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CPU) { - Tensor* tensor = createScalarBoolTensor( - true, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, true); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CPU) { - Tensor* tensor = createScalarBoolTensor( - false, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - bool result = true; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -// ============================================================================ -// Error Handling Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, NullTensor) { - bool result = false; - AOTITorchError error = aoti_torch_item_bool(nullptr, &result); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchItemBoolSlimTest, NullReturnValue) { - Tensor* tensor = createScalarBoolTensor( - true, static_cast(slim_c10::DeviceType::CPU), 0); - ASSERT_NE(tensor, nullptr); - - AOTITorchError error = aoti_torch_item_bool(tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor) { - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_GT(tensor->numel(), 1); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float) { - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Long) { - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Long), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -// ============================================================================ -// CUDA Tests -// ============================================================================ - -TEST_F(AOTITorchItemBoolSlimTest, TrueValue_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - Tensor* tensor = createScalarBoolTensor( - true, 
static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, true); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, FalseValue_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - Tensor* tensor = createScalarBoolTensor( - false, static_cast(slim_c10::DeviceType::CUDA), 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = true; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::Ok); - EXPECT_EQ(result, false); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, MultiElementTensor_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {2, 3}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Bool), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - EXPECT_TRUE(tensor->is_cuda()); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} - -TEST_F(AOTITorchItemBoolSlimTest, WrongDtype_Float_CUDA) { - if (!isCudaAvailable()) { - GTEST_SKIP() << "CUDA not available"; - } - - std::vector sizes = {1}; - Tensor* tensor = createTestTensor( - sizes, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CUDA), - 0); - ASSERT_NE(tensor, nullptr); - - bool result = false; - AOTITorchError error = aoti_torch_item_bool(tensor, &result); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(tensor), Error::Ok); -} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp index d123443cbfa..3a1de152f0b 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle.cpp @@ -7,64 +7,70 @@ */ #include -#include -#include -#include -#include -#include -#include #include #include -using namespace executorch::backends::aoti; +#include +#include +#include +#include +#include + using namespace executorch::backends::cuda; -using namespace executorch::runtime; -using executorch::runtime::etensor::Tensor; +using executorch::runtime::Error; -// Test fixture for aoti_torch_new_tensor_handle tests -class AOTITorchNewTensorHandleTest : public ::testing::Test { - protected: - void SetUp() override { - // Initialize ExecuTorch Platform Abstraction Layer - et_pal_init(); +namespace slim_c10 = executorch::backends::aoti::slim::c10; - // Check if CUDA is available - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - if (err != cudaSuccess || device_count == 0) { - GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; - } +namespace { + +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} - // Clean up any existing cached metadata before each test - cleanup_tensor_metadata(); +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector 
strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} - // Clear any remaining tensors from previous tests - clear_all_tensors(); +} // namespace + +class AOTITorchNewTensorHandleSlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); } void TearDown() override { - // Clean up metadata - cleanup_tensor_metadata(); - - // Clear the global tensor storage using the provided function - clear_all_tensors(); + // SlimTensor uses automatic reference counting - no manual cleanup needed } - // Helper to create test tensors - Tensor* create_test_tensor( + Tensor* createTestTensor( const std::vector& sizes, const std::vector& strides = {}, - int32_t dtype = static_cast(SupportedDTypes::FLOAT32), - int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t dtype = static_cast(slim_c10::ScalarType::Float), + int32_t device_type = static_cast(slim_c10::DeviceType::CPU), int32_t device_index = 0) { - Tensor* tensor; + Tensor* tensor = nullptr; - const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + std::vector effective_strides = strides; + if (strides.empty()) { + effective_strides = calculateContiguousStrides(sizes); + } AOTITorchError error = aoti_torch_empty_strided( sizes.size(), sizes.data(), - strides_ptr, + effective_strides.data(), dtype, device_type, device_index, @@ -74,97 +80,106 @@ class AOTITorchNewTensorHandleTest : public ::testing::Test { } }; -// Test basic functionality of creating a new tensor handle -TEST_F(AOTITorchNewTensorHandleTest, BasicFunctionality) { - // Create an original tensor +// ============================================================================ +// Basic Functionality Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create a new handle from the original tensor Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); EXPECT_NE(new_tensor, nullptr); - // Verify the new tensor has the same properties EXPECT_EQ(new_tensor->dim(), orig_tensor->dim()); EXPECT_EQ(new_tensor->size(0), orig_tensor->size(0)); EXPECT_EQ(new_tensor->size(1), orig_tensor->size(1)); EXPECT_EQ(new_tensor->numel(), orig_tensor->numel()); - // Verify they share the same memory - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_tensor->mutable_data_ptr()); + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating new handle from null tensor -TEST_F(AOTITorchNewTensorHandleTest, NullOriginalTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, NullOriginalTensor) { Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(nullptr, &new_tensor); EXPECT_EQ(error, Error::InvalidArgument); } -// Test passing null pointer for new handle -TEST_F(AOTITorchNewTensorHandleTest, NullNewHandle) { +TEST_F(AOTITorchNewTensorHandleSlimTest, 
NullNewHandle) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, nullptr); EXPECT_EQ(error, Error::InvalidArgument); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test memory sharing between original and new tensor handle -TEST_F(AOTITorchNewTensorHandleTest, MemorySharing) { - // Create an original tensor +// ============================================================================ +// Memory Sharing Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CPU) { std::vector sizes = {3, 4}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Get original memory pointer - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); ASSERT_NE(orig_ptr, nullptr); - // Create a new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify both tensors point to the same memory - void* new_ptr = new_tensor->mutable_data_ptr(); + void* new_ptr = new_tensor->data_ptr(); EXPECT_EQ(orig_ptr, new_ptr); - // Clean up - deleting one should not affect the other's validity EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // New tensor should still be valid and accessible - void* still_valid_ptr = new_tensor->mutable_data_ptr(); + void* still_valid_ptr = new_tensor->data_ptr(); EXPECT_EQ(still_valid_ptr, new_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating multiple handles from the same tensor -TEST_F(AOTITorchNewTensorHandleTest, MultipleHandles) { - // Create an original tensor +TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create multiple handles std::vector handles; const int num_handles = 5; @@ -174,246 +189,165 @@ TEST_F(AOTITorchNewTensorHandleTest, MultipleHandles) { aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); handles.push_back(new_tensor); } - // Delete original tensor EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid for (Tensor* handle : handles) { - EXPECT_EQ(handle->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle->data_ptr(), orig_ptr); EXPECT_EQ(handle->dim(), 2); EXPECT_EQ(handle->size(0), 2); EXPECT_EQ(handle->size(1), 3); } - // Delete all handles for (Tensor* handle : handles) { EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); } } -// Test creating handle from tensor 
with custom strides -TEST_F(AOTITorchNewTensorHandleTest, CustomStrides) { +// ============================================================================ +// Tensor Property Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, CustomStrides_CPU) { std::vector sizes = {3, 4}; std::vector strides = {4, 1}; // Row-major strides - Tensor* orig_tensor = create_test_tensor(sizes, strides); + Tensor* orig_tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify strides are preserved - int64_t* orig_strides_ptr; - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(orig_tensor, &orig_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); + EXPECT_EQ(orig_tensor->stride(0), new_tensor->stride(0)); + EXPECT_EQ(orig_tensor->stride(1), new_tensor->stride(1)); - EXPECT_EQ(orig_strides_ptr[0], new_strides_ptr[0]); - EXPECT_EQ(orig_strides_ptr[1], new_strides_ptr[1]); - - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from bfloat16 tensor -TEST_F(AOTITorchNewTensorHandleTest, BFloat16Tensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, BFloat16Tensor_CPU) { std::vector sizes = {2, 3, 4}; - Tensor* orig_tensor = create_test_tensor( + Tensor* orig_tensor = createTestTensor( sizes, {}, - static_cast(SupportedDTypes::BFLOAT16), - static_cast(SupportedDevices::CUDA)); + static_cast(slim_c10::ScalarType::BFloat16), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Verify original is bfloat16 - int32_t orig_dtype; - EXPECT_EQ(aoti_torch_get_dtype(orig_tensor, &orig_dtype), Error::Ok); - EXPECT_EQ(orig_dtype, static_cast(SupportedDTypes::BFLOAT16)); - - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify new tensor is also bfloat16 - int32_t new_dtype; - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ(new_dtype, static_cast(SupportedDTypes::BFLOAT16)); - - // Verify element size (bfloat16 should be 2 bytes) - EXPECT_EQ(new_tensor->element_size(), 2); + EXPECT_EQ(new_tensor->itemsize(), 2); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from scalar (0D) tensor -TEST_F(AOTITorchNewTensorHandleTest, ScalarTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, ScalarTensor_CPU) { std::vector sizes = {}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); EXPECT_EQ(orig_tensor->dim(), 0); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify scalar properties EXPECT_EQ(new_tensor->dim(), 0); 
EXPECT_EQ(new_tensor->numel(), 1); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle from zero-sized tensor -TEST_F(AOTITorchNewTensorHandleTest, ZeroSizedTensor) { - std::vector sizes = {0, 5}; - Tensor* orig_tensor = create_test_tensor(sizes); - ASSERT_NE(orig_tensor, nullptr); - EXPECT_EQ(orig_tensor->numel(), 0); - - // Attempt to create new handle - should fail because zero-sized tensors have - // null data pointers - Tensor* new_tensor = nullptr; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - - // Zero-sized tensors are not currently supported - EXPECT_EQ(error, Error::InvalidArgument); - EXPECT_EQ(new_tensor, nullptr); - - // Clean up original tensor - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// Test creating handle from large multi-dimensional tensor -TEST_F(AOTITorchNewTensorHandleTest, LargeMultiDimensionalTensor) { +TEST_F(AOTITorchNewTensorHandleSlimTest, LargeMultiDimensionalTensor_CPU) { std::vector sizes = {10, 20, 30}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify dimensions EXPECT_EQ(new_tensor->dim(), 3); EXPECT_EQ(new_tensor->size(0), 10); EXPECT_EQ(new_tensor->size(1), 20); EXPECT_EQ(new_tensor->size(2), 30); EXPECT_EQ(new_tensor->numel(), 6000); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle preserves tensor metadata -TEST_F(AOTITorchNewTensorHandleTest, MetadataPreservation) { - std::vector sizes = {2, 3, 4}; - std::vector strides = {12, 4, 1}; - Tensor* orig_tensor = create_test_tensor( - sizes, - strides, - static_cast(SupportedDTypes::FLOAT32), - static_cast(SupportedDevices::CUDA)); - ASSERT_NE(orig_tensor, nullptr); - - // Create new handle - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - - // Get and compare all metadata - int64_t* orig_sizes_ptr; - int64_t* new_sizes_ptr; - int64_t* orig_strides_ptr; - int64_t* new_strides_ptr; - int32_t orig_dtype, new_dtype; - int32_t orig_device_type, new_device_type; - int32_t orig_device_index, new_device_index; - - EXPECT_EQ(aoti_torch_get_sizes(orig_tensor, &orig_sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_sizes(new_tensor, &new_sizes_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(orig_tensor, &orig_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(aoti_torch_get_dtype(orig_tensor, &orig_dtype), Error::Ok); - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_type(orig_tensor, &orig_device_type), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_type(new_tensor, &new_device_type), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_index(orig_tensor, &orig_device_index), Error::Ok); - EXPECT_EQ( - aoti_torch_get_device_index(new_tensor, &new_device_index), Error::Ok); - - // Verify 
all metadata matches - for (int i = 0; i < 3; i++) { - EXPECT_EQ(orig_sizes_ptr[i], new_sizes_ptr[i]); - EXPECT_EQ(orig_strides_ptr[i], new_strides_ptr[i]); - } - EXPECT_EQ(orig_dtype, new_dtype); - EXPECT_EQ(orig_device_type, new_device_type); - EXPECT_EQ(orig_device_index, new_device_index); +// ============================================================================ +// Handle Chain Tests +// ============================================================================ - // Clean up - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -// Test creating handle chain: orig -> handle1 -> handle2 -TEST_F(AOTITorchNewTensorHandleTest, HandleChain) { +TEST_F(AOTITorchNewTensorHandleSlimTest, HandleChain_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create first handle Tensor* handle1; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &handle1); EXPECT_EQ(error, Error::Ok); ASSERT_NE(handle1, nullptr); - EXPECT_EQ(handle1->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle1->data_ptr(), orig_ptr); - // Create second handle from the first handle Tensor* handle2; error = aoti_torch_new_tensor_handle(handle1, &handle2); EXPECT_EQ(error, Error::Ok); ASSERT_NE(handle2, nullptr); - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); - // Delete in reverse order EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); } -// Test creating handle and verifying reference counting -TEST_F(AOTITorchNewTensorHandleTest, ReferenceCountingTest) { +TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create multiple handles Tensor* handle1; Tensor* handle2; Tensor* handle3; @@ -422,116 +356,276 @@ TEST_F(AOTITorchNewTensorHandleTest, ReferenceCountingTest) { EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok); EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok); - // Delete original EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid - EXPECT_EQ(handle1->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle1->data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); - // Delete handles one by one EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); - // Remaining handles should still be valid - EXPECT_EQ(handle2->mutable_data_ptr(), orig_ptr); - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), 
orig_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); - // Last handle should still be valid - EXPECT_EQ(handle3->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok); } -// Test creating handle from int32 tensor -TEST_F(AOTITorchNewTensorHandleTest, Int32Tensor) { +// ============================================================================ +// Different Dtype Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, Int64Tensor_CPU) { std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor( + Tensor* orig_tensor = createTestTensor( sizes, {}, - 3, // int32 - static_cast(SupportedDevices::CUDA)); + static_cast(slim_c10::ScalarType::Long), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify dtype - int32_t new_dtype; - EXPECT_EQ(aoti_torch_get_dtype(new_tensor, &new_dtype), Error::Ok); - EXPECT_EQ(new_dtype, 3); // int32 + EXPECT_EQ(new_tensor->itemsize(), 8); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle with incontiguous tensor (transpose-like layout) -TEST_F(AOTITorchNewTensorHandleTest, IncontiguousTransposeLayout) { +TEST_F(AOTITorchNewTensorHandleSlimTest, IncontiguousLayout_CPU) { std::vector sizes = {3, 4}; std::vector strides = {1, 3}; // Column-major (incontiguous) - Tensor* orig_tensor = create_test_tensor(sizes, strides); + Tensor* orig_tensor = createTestTensor( + sizes, + strides, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify strides are preserved - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(new_strides_ptr[0], 1); - EXPECT_EQ(new_strides_ptr[1], 3); + EXPECT_EQ(new_tensor->stride(0), 1); + EXPECT_EQ(new_tensor->stride(1), 3); - // Verify both tensors share the same memory - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_tensor->mutable_data_ptr()); + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Test creating handle with expanded strides (broadcasted dimension) -TEST_F(AOTITorchNewTensorHandleTest, ExpandedStrides) { - std::vector sizes = {2, 3, 4}; - std::vector strides = {0, 4, 1}; // First dimension has stride 0 - Tensor* orig_tensor = create_test_tensor(sizes, strides); +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + 
static_cast(slim_c10::DeviceType::CUDA), + 0); ASSERT_NE(orig_tensor, nullptr); + EXPECT_TRUE(orig_tensor->is_cuda()); + + Tensor* new_tensor; + AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(new_tensor, nullptr); + EXPECT_TRUE(new_tensor->is_cuda()); + + EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); +} + +TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {3, 4}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + ASSERT_NE(orig_ptr, nullptr); - // Create new handle Tensor* new_tensor; AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - // Verify expanded strides are preserved - int64_t* new_strides_ptr; - EXPECT_EQ(aoti_torch_get_strides(new_tensor, &new_strides_ptr), Error::Ok); - EXPECT_EQ(new_strides_ptr[0], 0); - EXPECT_EQ(new_strides_ptr[1], 4); - EXPECT_EQ(new_strides_ptr[2], 1); + void* new_ptr = new_tensor->data_ptr(); + EXPECT_EQ(orig_ptr, new_ptr); - // Clean up EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + void* still_valid_ptr = new_tensor->data_ptr(); + EXPECT_EQ(still_valid_ptr, new_ptr); + EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); } -// Stress test: create many handles -TEST_F(AOTITorchNewTensorHandleTest, StressTestManyHandles) { +TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + std::vector sizes = {2, 3}; - Tensor* orig_tensor = create_test_tensor(sizes); + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + + std::vector handles; + const int num_handles = 5; + + for (int i = 0; i < num_handles; i++) { + Tensor* new_tensor; + AOTITorchError error = + aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(new_tensor, nullptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); + handles.push_back(new_tensor); + } + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + for (Tensor* handle : handles) { + EXPECT_EQ(handle->data_ptr(), orig_ptr); + EXPECT_TRUE(handle->is_cuda()); + } + + for (Tensor* handle : handles) { + EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); + } +} + +TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(orig_tensor, nullptr); + + void* orig_ptr = orig_tensor->data_ptr(); + + Tensor* handle1; + Tensor* handle2; + Tensor* handle3; + + EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok); + EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok); + 
EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok); + + EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); + + EXPECT_EQ(handle1->data_ptr(), orig_ptr); + EXPECT_EQ(handle2->data_ptr(), orig_ptr); + EXPECT_EQ(handle3->data_ptr(), orig_ptr); + + EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok); +} + +// ============================================================================ +// Mixed Device Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, MixedDeviceHandles) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + std::vector sizes = {2, 3}; + + Tensor* cpu_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); + ASSERT_NE(cpu_tensor, nullptr); + EXPECT_TRUE(cpu_tensor->is_cpu()); + + Tensor* cuda_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0); + ASSERT_NE(cuda_tensor, nullptr); + EXPECT_TRUE(cuda_tensor->is_cuda()); + + Tensor* cpu_handle; + Tensor* cuda_handle; + + EXPECT_EQ(aoti_torch_new_tensor_handle(cpu_tensor, &cpu_handle), Error::Ok); + EXPECT_EQ(aoti_torch_new_tensor_handle(cuda_tensor, &cuda_handle), Error::Ok); + + EXPECT_TRUE(cpu_handle->is_cpu()); + EXPECT_TRUE(cuda_handle->is_cuda()); + EXPECT_NE(cpu_handle->data_ptr(), cuda_handle->data_ptr()); + + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_handle), Error::Ok); + EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_handle), Error::Ok); +} + +// ============================================================================ +// Stress Tests +// ============================================================================ + +TEST_F(AOTITorchNewTensorHandleSlimTest, StressTestManyHandles_CPU) { + std::vector sizes = {2, 3}; + Tensor* orig_tensor = createTestTensor( + sizes, + {}, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0); ASSERT_NE(orig_tensor, nullptr); - void* orig_ptr = orig_tensor->mutable_data_ptr(); + void* orig_ptr = orig_tensor->data_ptr(); - // Create many handles const int num_handles = 100; std::vector handles; @@ -541,19 +635,16 @@ TEST_F(AOTITorchNewTensorHandleTest, StressTestManyHandles) { aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); EXPECT_EQ(error, Error::Ok); ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); handles.push_back(new_tensor); } - // Delete original EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - // All handles should still be valid for (Tensor* handle : handles) { - EXPECT_EQ(handle->mutable_data_ptr(), orig_ptr); + EXPECT_EQ(handle->data_ptr(), orig_ptr); } - // Delete all handles for (Tensor* handle : handles) { EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok); } diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp deleted file mode 100644 index 3a1de152f0b..00000000000 --- 
a/backends/cuda/runtime/shims/tests/test_aoti_torch_new_tensor_handle_slim.cpp +++ /dev/null @@ -1,651 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -using namespace executorch::backends::cuda; -using executorch::runtime::Error; - -namespace slim_c10 = executorch::backends::aoti::slim::c10; - -namespace { - -bool isCudaAvailable() { - int device_count = 0; - cudaError_t err = cudaGetDeviceCount(&device_count); - return (err == cudaSuccess && device_count > 0); -} - -std::vector calculateContiguousStrides( - const std::vector& sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) { - return strides; - } - strides[sizes.size() - 1] = 1; - for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * sizes[i + 1]; - } - return strides; -} - -} // namespace - -class AOTITorchNewTensorHandleSlimTest : public ::testing::Test { - protected: - void SetUp() override { - et_pal_init(); - } - - void TearDown() override { - // SlimTensor uses automatic reference counting - no manual cleanup needed - } - - Tensor* createTestTensor( - const std::vector& sizes, - const std::vector& strides = {}, - int32_t dtype = static_cast(slim_c10::ScalarType::Float), - int32_t device_type = static_cast(slim_c10::DeviceType::CPU), - int32_t device_index = 0) { - Tensor* tensor = nullptr; - - std::vector effective_strides = strides; - if (strides.empty()) { - effective_strides = calculateContiguousStrides(sizes); - } - - AOTITorchError error = aoti_torch_empty_strided( - sizes.size(), - sizes.data(), - effective_strides.data(), - dtype, - device_type, - device_index, - &tensor); - - return (error == Error::Ok) ? 
tensor : nullptr; - } -}; - -// ============================================================================ -// Basic Functionality Tests -// ============================================================================ - -TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CPU) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - - EXPECT_EQ(error, Error::Ok); - EXPECT_NE(new_tensor, nullptr); - - EXPECT_EQ(new_tensor->dim(), orig_tensor->dim()); - EXPECT_EQ(new_tensor->size(0), orig_tensor->size(0)); - EXPECT_EQ(new_tensor->size(1), orig_tensor->size(1)); - EXPECT_EQ(new_tensor->numel(), orig_tensor->numel()); - - EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr()); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, NullOriginalTensor) { - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(nullptr, &new_tensor); - - EXPECT_EQ(error, Error::InvalidArgument); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, NullNewHandle) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, nullptr); - - EXPECT_EQ(error, Error::InvalidArgument); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); -} - -// ============================================================================ -// Memory Sharing Tests -// ============================================================================ - -TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CPU) { - std::vector sizes = {3, 4}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - ASSERT_NE(orig_ptr, nullptr); - - Tensor* new_tensor; - AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - - void* new_ptr = new_tensor->data_ptr(); - EXPECT_EQ(orig_ptr, new_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok); - - void* still_valid_ptr = new_tensor->data_ptr(); - EXPECT_EQ(still_valid_ptr, new_ptr); - - EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok); -} - -TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CPU) { - std::vector sizes = {2, 3}; - Tensor* orig_tensor = createTestTensor( - sizes, - {}, - static_cast(slim_c10::ScalarType::Float), - static_cast(slim_c10::DeviceType::CPU), - 0); - ASSERT_NE(orig_tensor, nullptr); - - void* orig_ptr = orig_tensor->data_ptr(); - - std::vector handles; - const int num_handles = 5; - - for (int i = 0; i < num_handles; i++) { - Tensor* new_tensor; - AOTITorchError error = - aoti_torch_new_tensor_handle(orig_tensor, &new_tensor); - EXPECT_EQ(error, Error::Ok); - ASSERT_NE(new_tensor, nullptr); - EXPECT_EQ(new_tensor->data_ptr(), orig_ptr); - handles.push_back(new_tensor); - } - - 
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-    EXPECT_EQ(handle->dim(), 2);
-    EXPECT_EQ(handle->size(0), 2);
-    EXPECT_EQ(handle->size(1), 3);
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
-
-// ============================================================================
-// Tensor Property Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, CustomStrides_CPU) {
-  std::vector<int64_t> sizes = {3, 4};
-  std::vector<int64_t> strides = {4, 1}; // Row-major strides
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      strides,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(orig_tensor->stride(0), new_tensor->stride(0));
-  EXPECT_EQ(orig_tensor->stride(1), new_tensor->stride(1));
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, BFloat16Tensor_CPU) {
-  std::vector<int64_t> sizes = {2, 3, 4};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::BFloat16),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->itemsize(), 2);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ScalarTensor_CPU) {
-  std::vector<int64_t> sizes = {};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-  EXPECT_EQ(orig_tensor->dim(), 0);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->dim(), 0);
-  EXPECT_EQ(new_tensor->numel(), 1);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, LargeMultiDimensionalTensor_CPU) {
-  std::vector<int64_t> sizes = {10, 20, 30};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->dim(), 3);
-  EXPECT_EQ(new_tensor->size(0), 10);
-  EXPECT_EQ(new_tensor->size(1), 20);
-  EXPECT_EQ(new_tensor->size(2), 30);
-  EXPECT_EQ(new_tensor->numel(), 6000);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-// ============================================================================
-// Handle Chain Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, HandleChain_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &handle1);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(handle1, nullptr);
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-
-  Tensor* handle2;
-  error = aoti_torch_new_tensor_handle(handle1, &handle2);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(handle2, nullptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  Tensor* handle2;
-  Tensor* handle3;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok);
-}
-
-// ============================================================================
-// Different Dtype Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, Int64Tensor_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Long),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->itemsize(), 8);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, IncontiguousLayout_CPU) {
-  std::vector<int64_t> sizes = {3, 4};
-  std::vector<int64_t> strides = {1, 3}; // Column-major (incontiguous)
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      strides,
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  EXPECT_EQ(new_tensor->stride(0), 1);
-  EXPECT_EQ(new_tensor->stride(1), 3);
-
-  EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-// ============================================================================
-// CUDA Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, BasicFunctionality_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-  EXPECT_TRUE(orig_tensor->is_cuda());
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-
-  EXPECT_EQ(error, Error::Ok);
-  EXPECT_NE(new_tensor, nullptr);
-  EXPECT_TRUE(new_tensor->is_cuda());
-
-  EXPECT_EQ(new_tensor->data_ptr(), orig_tensor->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MemorySharing_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {3, 4};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-  ASSERT_NE(orig_ptr, nullptr);
-
-  Tensor* new_tensor;
-  AOTITorchError error = aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-  EXPECT_EQ(error, Error::Ok);
-  ASSERT_NE(new_tensor, nullptr);
-
-  void* new_ptr = new_tensor->data_ptr();
-  EXPECT_EQ(orig_ptr, new_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  void* still_valid_ptr = new_tensor->data_ptr();
-  EXPECT_EQ(still_valid_ptr, new_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(new_tensor), Error::Ok);
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MultipleHandles_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  std::vector<Tensor*> handles;
-  const int num_handles = 5;
-
-  for (int i = 0; i < num_handles; i++) {
-    Tensor* new_tensor;
-    AOTITorchError error =
-        aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-    EXPECT_EQ(error, Error::Ok);
-    ASSERT_NE(new_tensor, nullptr);
-    EXPECT_EQ(new_tensor->data_ptr(), orig_ptr);
-    handles.push_back(new_tensor);
-  }
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-    EXPECT_TRUE(handle->is_cuda());
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, ReferenceCountingTest_CUDA) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  Tensor* handle1;
-  Tensor* handle2;
-  Tensor* handle3;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(orig_tensor, &handle3), Error::Ok);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  EXPECT_EQ(handle1->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle2->data_ptr(), orig_ptr);
-  EXPECT_EQ(handle3->data_ptr(), orig_ptr);
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle1), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle2), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(handle3), Error::Ok);
-}
-
-// ============================================================================
-// Mixed Device Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, MixedDeviceHandles) {
-  if (!isCudaAvailable()) {
-    GTEST_SKIP() << "CUDA not available";
-  }
-
-  std::vector<int64_t> sizes = {2, 3};
-
-  Tensor* cpu_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(cpu_tensor, nullptr);
-  EXPECT_TRUE(cpu_tensor->is_cpu());
-
-  Tensor* cuda_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
-      0);
-  ASSERT_NE(cuda_tensor, nullptr);
-  EXPECT_TRUE(cuda_tensor->is_cuda());
-
-  Tensor* cpu_handle;
-  Tensor* cuda_handle;
-
-  EXPECT_EQ(aoti_torch_new_tensor_handle(cpu_tensor, &cpu_handle), Error::Ok);
-  EXPECT_EQ(aoti_torch_new_tensor_handle(cuda_tensor, &cuda_handle), Error::Ok);
-
-  EXPECT_TRUE(cpu_handle->is_cpu());
-  EXPECT_TRUE(cuda_handle->is_cuda());
-  EXPECT_NE(cpu_handle->data_ptr(), cuda_handle->data_ptr());
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_tensor), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_handle), Error::Ok);
-  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_handle), Error::Ok);
-}
-
-// ============================================================================
-// Stress Tests
-// ============================================================================
-
-TEST_F(AOTITorchNewTensorHandleSlimTest, StressTestManyHandles_CPU) {
-  std::vector<int64_t> sizes = {2, 3};
-  Tensor* orig_tensor = createTestTensor(
-      sizes,
-      {},
-      static_cast<int32_t>(slim_c10::ScalarType::Float),
-      static_cast<int32_t>(slim_c10::DeviceType::CPU),
-      0);
-  ASSERT_NE(orig_tensor, nullptr);
-
-  void* orig_ptr = orig_tensor->data_ptr();
-
-  const int num_handles = 100;
-  std::vector<Tensor*> handles;
-
-  for (int i = 0; i < num_handles; i++) {
-    Tensor* new_tensor;
-    AOTITorchError error =
-        aoti_torch_new_tensor_handle(orig_tensor, &new_tensor);
-    EXPECT_EQ(error, Error::Ok);
-    ASSERT_NE(new_tensor, nullptr);
-    EXPECT_EQ(new_tensor->data_ptr(), orig_ptr);
-    handles.push_back(new_tensor);
-  }
-
-  EXPECT_EQ(aoti_torch_delete_tensor_object(orig_tensor), Error::Ok);
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(handle->data_ptr(), orig_ptr);
-  }
-
-  for (Tensor* handle : handles) {
-    EXPECT_EQ(aoti_torch_delete_tensor_object(handle), Error::Ok);
-  }
-}
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
index 4474f8cf57e..8517ec21af6 100644
--- a/backends/cuda/runtime/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -31,6 +31,7 @@
   } while (0)
 
 // CUDA error checking macro (without return, for use in void functions)
+#ifndef ET_CUDA_CHECK
 #define ET_CUDA_CHECK(EXPR) \
   do { \
     const cudaError_t err = EXPR; \
@@ -45,6 +46,7 @@
         cudaGetErrorString(err)); \
     ET_CHECK_MSG(false, "CUDA error: %s", cudaGetErrorString(err)); \
   } while (0)
+#endif
 
 // Kernel launch check macro (with return)
 #define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \