diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 8b94351d469..1db8792e0c0 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
@@ -62,9 +95,10 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index d18c0118542..a85f3a7e6a3 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -27,6 +27,25 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "tensor_maker",
+    srcs = [
+        "tensor/tensor_maker.cpp",
+    ],
+    headers = [
+        "tensor/tensor_maker.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["@EXECUTORCH_CLIENTS"],
+    deps = [
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/core/exec_aten:lib",
+        "//executorch/runtime/core/exec_aten/util:tensor_util",
+    ],
+)
+
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
@@ -52,8 +71,8 @@ runtime.cxx_library(
     compiler_flags = ["-Wno-global-constructors"],
     visibility = ["@EXECUTORCH_CLIENTS"],
     deps = [
+        ":tensor_maker",
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
         "//executorch/runtime/core:core",
        "//executorch/runtime/core/exec_aten:lib",
         "//executorch/runtime/platform:platform",
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 5d30d3124d9..46b8d448a3a 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
 #include
 #include
 #include
@@ -163,9 +164,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
 
   // Create ExecutorTorch tensor that wraps the existing memory
   // Note: We're NOT copying the data, just wrapping it
-  auto tensor = executorch::extension::from_blob(
-      data, // existing memory (don't copy!)
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  auto tensor = make_tensor(
       sizes, // tensor dimensions
+      data, // existing memory (don't copy!)
+      {}, // dim_order (empty, will be auto-generated)
       strides, // tensor strides (allows different strides)
       dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType
   );
@@ -210,10 +213,6 @@ AOTITorchError aoti_torch_empty_strided(
 
   // This requires us to reserve CUDA memory and put it into a ETensor
   void* ptr;
-  int64_t numel = 1;
-  for (int64_t i = 0; i < ndim; i++) {
-    numel *= sizes_ptr[i];
-  }
 
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
 
@@ -223,7 +222,28 @@
       InvalidArgument,
       "Invalid element size for dtype: %d",
       dtype);
-  int64_t nbytes = numel * element_size;
+
+  // Calculate storage size based on strides, matching PyTorch's behavior.
+  // This is critical when sizes and strides don't match the expected contiguous
+  // layout. Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp
+  int64_t storage_size = 1; // storage offset (0) + 1
+  for (int64_t i = 0; i < ndim; i++) {
+    if (sizes_ptr[i] == 0) {
+      storage_size = 0;
+      break;
+    }
+    // For each dimension, add stride[i] * (size[i] - 1).
+    // This gives us the maximum offset in that dimension.
+    int64_t stride_i = (strides_ptr != nullptr) ? strides_ptr[i] : 1;
+    if (strides_ptr == nullptr) {
+      // Calculate contiguous stride if not provided
+      for (int64_t j = i + 1; j < ndim; j++) {
+        stride_i *= sizes_ptr[j];
+      }
+    }
+    storage_size += stride_i * (sizes_ptr[i] - 1);
+  }
+  int64_t nbytes = storage_size * element_size;
 
   if (device_type == static_cast<int32_t>(SupportedDevices::CUDA)) {
     ET_CUDA_CHECK_OR_RETURN_ERROR(
@@ -250,8 +270,13 @@ AOTITorchError aoti_torch_empty_strided(
   auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
 
   // ETensor creation with dynamic shape support for edge cases
-  auto tensor = executorch::extension::from_blob(
-      ptr, sizes, strides, dtype_to_scalar_type(dtype));
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  auto tensor = make_tensor(
+      sizes,
+      ptr,
+      {}, // dim_order (empty, will be auto-generated)
+      strides,
+      dtype_to_scalar_type(dtype));
 
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
@@ -259,7 +284,6 @@
 
   // This tensor owns the memory it allocated, set reference count to 1
   memory_to_n_tensor[ptr] = 1;
-
   return Error::Ok;
 }
 
@@ -630,9 +654,11 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
   // Create new tensor view that reinterprets the same memory with different
   // shape/strides This creates a view, not a copy - the data pointer is shared
-  std::shared_ptr<Tensor> tensor = executorch::extension::from_blob(
-      data_ptr, // Reuse the same memory from source tensor
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  std::shared_ptr<Tensor> tensor = make_tensor(
       sizes, // New sizes with explicit SizesType
+      data_ptr, // Reuse the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
       strides, // New strides with explicit StridesType
       dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting
   );
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
index 15a4e397d24..6b61b5bd3b8 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.h
+++ b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -8,8 +8,8 @@
 
 #pragma once
 
-#include
 #include
+#include
 #include
 
 namespace executorch::backends::cuda {
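Aside on the allocation rule added to aoti_torch_empty_strided above: the backing storage needs 1 + sum_i stride[i] * (size[i] - 1) elements (or 0 if any dimension has size 0), which mirrors PyTorch's computeStorageNbytes. The standalone sketch below is illustrative only and not part of the patch; the helper name compute_storage_size and the example shapes are hypothetical.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper mirroring the logic added to aoti_torch_empty_strided:
// storage elements = 1 + sum(stride[i] * (size[i] - 1)), or 0 if any size is 0.
// When strides are absent, contiguous strides are derived from the sizes.
int64_t compute_storage_size(
    const std::vector<int64_t>& sizes,
    const int64_t* strides /* may be nullptr */) {
  int64_t storage_size = 1; // storage offset (0) + 1
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) {
      return 0; // a zero-sized dimension needs no storage at all
    }
    int64_t stride_i = (strides != nullptr) ? strides[i] : 1;
    if (strides == nullptr) {
      for (size_t j = i + 1; j < sizes.size(); ++j) {
        stride_i *= sizes[j]; // contiguous stride for dimension i
      }
    }
    storage_size += stride_i * (sizes[i] - 1);
  }
  return storage_size;
}

int main() {
  const int64_t col_major[] = {1, 3}; // 3x4 column-major: 1*2 + 3*3 + 1 = 12
  const int64_t expanded[] = {0, 4, 1}; // 2x3x4 broadcast dim 0: 0 + 8 + 3 + 1 = 12
  std::printf("%lld\n", (long long)compute_storage_size({3, 4}, col_major)); // 12
  std::printf("%lld\n", (long long)compute_storage_size({2, 3, 4}, expanded)); // 12
  std::printf("%lld\n", (long long)compute_storage_size({2, 3, 4}, nullptr)); // 24 (contiguous)
  return 0;
}
```

For the two layouts exercised by the new tests below, both the column-major 3x4 tensor and the broadcast 2x3x4 tensor need only 12 backing elements, while a contiguous 2x3x4 tensor needs 24.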
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
index 1cefca99c2a..d3044810b15 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
 #include
 #include
 #include
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
index da65129f18a..799a8d1221b 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
@@ -278,30 +278,6 @@ TEST_F(AOTITorchEmptyStridedTest, LargeTensor) {
   EXPECT_EQ(tensor->size(2), 50);
 }
 
-// Test error handling with memory allocation failures
-TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) {
-  // Try to create a very large tensor that might cause allocation failure
-  // (This test may pass or fail depending on available memory)
-  std::vector<int64_t> huge_sizes = {10000, 10000, 100}; // ~38GB for float32
-  Tensor* tensor;
-
-  AOTITorchError error = aoti_torch_empty_strided(
-      huge_sizes.size(),
-      huge_sizes.data(),
-      nullptr,
-      6, // float32
-      1, // CUDA device
-      0, // device index
-      &tensor);
-
-  // Either succeed or fail with memory allocation error
-  if (error == Error::Ok) {
-    EXPECT_NE(tensor, nullptr);
-  } else {
-    EXPECT_EQ(error, Error::MemoryAllocationFailed);
-  }
-}
-
 // Test aoti_torch_empty_strided with bfloat16 dtype
 TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) {
   // Test creating bfloat16 tensor on CUDA
@@ -509,11 +485,11 @@ TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) {
   EXPECT_EQ(sizes_ptr[2], 3);
 }
 
-// Test different data types (only float32 is currently supported)
+// Test different data types (currently bf16, fp32, and int32 are supported)
 TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
   std::vector<int64_t> sizes = {2, 3};
 
-  // Test float32 (dtype 6) - currently the only supported type
+  // Test float32 (dtype 6) - one of the supported types
   Tensor* tensor_float32;
   AOTITorchError error = aoti_torch_empty_strided(
       sizes.size(),
@@ -527,7 +503,7 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
   EXPECT_EQ(error, Error::Ok);
   EXPECT_NE(tensor_float32, nullptr);
 
-  // Test unsupported data types should return error
+  // Test int32 (dtype 3) - one of the supported types
   Tensor* tensor_int32;
   error = aoti_torch_empty_strided(
       sizes.size(),
@@ -538,7 +514,8 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
       0, // device index
       &tensor_int32);
 
-  EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor_int32, nullptr);
 
   // Test another unsupported data type
   Tensor* tensor_float64;
@@ -586,3 +563,105 @@ TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) {
   EXPECT_EQ(tensor_5d->size(3), 4);
   EXPECT_EQ(tensor_5d->size(4), 5);
 }
+
+// Test incontiguous tensor creation - transpose-like layout
+TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) {
+  // Create a tensor with transpose-like strides (column-major).
+  // For a 3x4 tensor in column-major order, strides should be [1, 3]:
+  // each row step is 1, and each column step is 3.
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {1, 3}; // Column-major (incontiguous)
+
+  Tensor* tensor;
+  AOTITorchError error = aoti_torch_empty_strided(
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA),
+      0, // device index
+      &tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor, nullptr);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->size(1), 4);
+
+  // Verify the strides are what we specified
+  int64_t* strides_ptr;
+  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
+  EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0
+  EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1
+
+  // Verify that memory was allocated correctly for the incontiguous layout.
+  // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - 1) + 1
+  // = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 2 + 9 + 1 = 12 elements.
+  // Total bytes = 12 * 4 = 48 bytes (for float32).
+  EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for the logical shape
+
+  // The tensor should be accessible and writable
+  void* data_ptr = tensor->mutable_data_ptr();
+  EXPECT_NE(data_ptr, nullptr);
+
+  // Verify we can use CUDA to write to the memory
+  std::vector<float> test_data(12, 1.0f);
+  cudaError_t cuda_err = cudaMemcpy(
+      data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice);
+  EXPECT_EQ(cuda_err, cudaSuccess);
+}
+
+// Test incontiguous tensor creation - expanded/broadcasted stride pattern
+TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) {
+  // Create a tensor with expanded strides (simulating broadcasting):
+  // a 2x3x4 tensor where the first dimension has stride 0 (expanded).
+  // This creates a tensor where the first dimension is "broadcasted".
+  std::vector<int64_t> sizes = {2, 3, 4};
+  std::vector<int64_t> strides = {0, 4, 1}; // First dimension has stride 0
+
+  Tensor* tensor;
+  AOTITorchError error = aoti_torch_empty_strided(
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA),
+      0, // device index
+      &tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor, nullptr);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor->dim(), 3);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 3);
+  EXPECT_EQ(tensor->size(2), 4);
+
+  // Verify the strides are what we specified
+  int64_t* strides_ptr;
+  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
+  EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride
+  EXPECT_EQ(strides_ptr[1], 4);
+  EXPECT_EQ(strides_ptr[2], 1);
+
+  // Verify that memory was allocated correctly for this incontiguous layout.
+  // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - 1)
+  // + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 - 1) + 1
+  // = 0 + 8 + 3 + 1 = 12 elements.
+  // Note: numel() returns the logical number of elements (2*3*4=24), not the
+  // storage size.
+  EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24
+
+  // The tensor should be accessible and writable
+  void* data_ptr = tensor->mutable_data_ptr();
+  EXPECT_NE(data_ptr, nullptr);
+
+  // Verify we can use CUDA to write to the allocated memory.
+  // Only 12 elements (the storage size) are needed, not 24.
+  std::vector<float> test_data(12, 2.0f);
+  cudaError_t cuda_err = cudaMemcpy(
+      data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice);
+  EXPECT_EQ(cuda_err, cudaSuccess);
+}
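A quick cross-check on the expanded-stride test above, outside the patch: with strides {0, 4, 1}, every logical index (i, j, k) of the 2x3x4 tensor maps to storage offset i*0 + j*4 + k*1, so both values of i alias the same 12 backing elements. The brute-force sketch below is a hypothetical illustration, not code from this PR.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t sizes[3] = {2, 3, 4};
  const int64_t strides[3] = {0, 4, 1}; // dim 0 is broadcast (stride 0)
  int64_t max_offset = 0;
  for (int64_t i = 0; i < sizes[0]; ++i) {
    for (int64_t j = 0; j < sizes[1]; ++j) {
      for (int64_t k = 0; k < sizes[2]; ++k) {
        // Logical element (i, j, k) lives at this storage offset.
        int64_t offset = i * strides[0] + j * strides[1] + k * strides[2];
        if (offset > max_offset) {
          max_offset = offset;
        }
      }
    }
  }
  // 24 logical elements, but only max_offset + 1 = 12 storage elements.
  std::printf("storage elements needed: %lld\n", (long long)(max_offset + 1));
  return 0;
}
```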
diff --git a/backends/cuda/runtime/tensor/tensor_maker.cpp b/backends/cuda/runtime/tensor/tensor_maker.cpp
new file mode 100644
index 00000000000..01252082bfc
--- /dev/null
+++ b/backends/cuda/runtime/tensor/tensor_maker.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
+
+#include <numeric>
+
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+
+namespace executorch::backends::cuda {
+
+namespace {
+#ifndef USE_ATEN_LIB
+/**
+ * A structure that consolidates the metadata (sizes, dim_order, strides) and
+ * the data buffer associated with a Tensor. Since Tensor does not own
+ * the memory for these metadata arrays or the data itself, this structure
+ * ensures that they are managed together and have the same lifetime as the
+ * Tensor. When the Tensor is destroyed, the Storage structure ensures
+ * proper cleanup of the associated metadata and data if needed.
+ */
+struct Storage final {
+  executorch::aten::TensorImpl tensor_impl;
+  executorch::aten::Tensor tensor;
+  std::vector<executorch::aten::SizesType> sizes;
+  std::vector<executorch::aten::DimOrderType> dim_order;
+  std::vector<executorch::aten::StridesType> strides;
+  std::function<void(void*)> deleter;
+
+  Storage(
+      executorch::aten::TensorImpl&& tensor_impl,
+      std::vector<executorch::aten::SizesType>&& sizes,
+      std::vector<executorch::aten::DimOrderType>&& dim_order,
+      std::vector<executorch::aten::StridesType>&& strides,
+      std::function<void(void*)>&& deleter)
+      : tensor_impl(std::move(tensor_impl)),
+        tensor(&this->tensor_impl),
+        sizes(std::move(sizes)),
+        dim_order(std::move(dim_order)),
+        strides(std::move(strides)),
+        deleter(std::move(deleter)) {}
+
+  ~Storage() {
+    if (deleter) {
+      deleter(tensor_impl.mutable_data());
+    }
+  }
+};
+#endif // USE_ATEN_LIB
+} // namespace
+
+TensorPtr make_tensor(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type,
+    executorch::aten::TensorShapeDynamism dynamism,
+    std::function<void(void*)> deleter) {
+  const auto dim = sizes.size();
+  ET_CHECK_MSG(
+      dim_order.empty() || dim_order.size() == dim,
+      "dim_order size must match sizes or be empty.");
+  ET_CHECK_MSG(
+      strides.empty() || strides.size() == dim,
+      "strides size must match sizes or be empty.");
+
+  if (dim_order.empty()) {
+    dim_order.resize(dim);
+    std::iota(dim_order.begin(), dim_order.end(), 0);
+    if (!strides.empty()) {
+      std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) {
+        return strides[a] > strides[b];
+      });
+    }
+  }
+
+  // AOTI backends (like AOTI-CUDA) handle both contiguous and incontiguous
+  // tensors, so we skip stride calculation and incontiguous tensor checks.
+  // Strides are passed through as-is without validation.
+
+#ifndef USE_ATEN_LIB
+  executorch::aten::TensorImpl tensor_impl(
+      type,
+      dim,
+      sizes.data(),
+      data,
+      dim_order.data(),
+      strides.data(),
+      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC);
+  auto storage = std::make_shared<Storage>(
+      std::move(tensor_impl),
+      std::move(sizes),
+      std::move(dim_order),
+      std::move(strides),
+      std::move(deleter));
+  const auto tensor_ptr = &storage->tensor;
+  return std::shared_ptr<executorch::aten::Tensor>(
+      std::move(storage), tensor_ptr);
+#else
+  auto options = c10::TensorOptions()
+                     .dtype(c10::scalarTypeToTypeMeta(type))
+                     .device(c10::kCPU);
+  auto storage = c10::Storage(
+      c10::Storage::use_byte_size_t(),
+      at::detail::computeStorageNbytes(
+          sizes, strides, options.dtype().itemsize()),
+      c10::InefficientStdFunctionContext::makeDataPtr(
+          data, std::move(deleter), options.device()),
+      nullptr,
+      false);
+  auto tensor_impl = c10::make_intrusive<c10::TensorImpl>(
+      std::move(storage),
+      c10::DispatchKeySet(c10::DispatchKey::CPU),
+      options.dtype());
+  tensor_impl->set_sizes_and_strides(sizes, strides);
+  return std::make_shared<at::Tensor>(std::move(tensor_impl));
+#endif // USE_ATEN_LIB
+}
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/tensor/tensor_maker.h b/backends/cuda/runtime/tensor/tensor_maker.h
new file mode 100644
index 00000000000..92cdec60bb4
--- /dev/null
+++ b/backends/cuda/runtime/tensor/tensor_maker.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+
+namespace executorch::backends::cuda {
+
+/**
+ * A smart pointer type for managing the lifecycle of a Tensor.
+ * This is compatible with executorch::extension::TensorPtr.
+ */
+using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
+
+/**
+ * Creates a TensorPtr for AOTI backends that skips stride calculation and
+ * incontiguous tensor checks. This is specifically designed for AOTI-CUDA,
+ * which handles both contiguous and incontiguous tensors.
+ *
+ * This function is similar to executorch::extension::make_tensor_ptr but
+ * bypasses the stride validation that assumes contiguous tensors, making it
+ * suitable for AOTI backends that support arbitrary strides.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed
+ * Tensor object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+TensorPtr make_tensor(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function<void(void*)> deleter = nullptr);
+
+} // namespace executorch::backends::cuda
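For context on how the new API is meant to be called: below is a possible call site for make_tensor, based on the signature declared in tensor_maker.h above, wrapping an externally allocated CUDA buffer with column-major strides and leaving dim_order empty so it is inferred from the strides (largest stride first). This is a sketch under assumptions: the buffer size, the cudaFree deleter, and the non-ATen (portable Tensor) build are illustrative choices, not code from the patch.

```cpp
#include <cuda_runtime.h>

#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>

using executorch::backends::cuda::make_tensor;

int main() {
  // 3x4 float tensor stored column-major: strides {1, 3}, 12 backing elements.
  void* device_ptr = nullptr;
  if (cudaMalloc(&device_ptr, 12 * sizeof(float)) != cudaSuccess) {
    return 1;
  }

  auto tensor = make_tensor(
      /*sizes=*/{3, 4},
      /*data=*/device_ptr,
      /*dim_order=*/{}, // inferred from strides: {1, 0} for column-major
      /*strides=*/{1, 3},
      executorch::aten::ScalarType::Float,
      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
      /*deleter=*/[](void* ptr) { cudaFree(ptr); });

  // The view is incontiguous; no stride validation or re-layout happens here.
  // When the TensorPtr goes out of scope, the deleter frees the CUDA buffer.
  return tensor->numel() == 12 ? 0 : 1;
}
```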