diff --git a/backends/cuda/runtime/shims/memory_slim.cpp b/backends/cuda/runtime/shims/memory_slim.cpp
index fbf2bd8ed9c..7996d330db5 100644
--- a/backends/cuda/runtime/shims/memory_slim.cpp
+++ b/backends/cuda/runtime/shims/memory_slim.cpp
@@ -206,6 +206,25 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) {
+  ET_CHECK_OR_RETURN_ERROR(
+      src != nullptr,
+      InvalidArgument,
+      "aoti_torch_assign_tensors_out: src is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_dst != nullptr,
+      InvalidArgument,
+      "aoti_torch_assign_tensors_out: ret_dst is null");
+
+  // Move the source tensor into the destination. After this operation,
+  // the source tensor is left in an undefined (reset) state.
+  // This differs from aoti_torch_new_tensor_handle, which copies the tensor.
+  *ret_dst = new Tensor(std::move(*src));
+
+  return Error::Ok;
+}
+
 } // extern "C"
 
 } // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory_slim.h b/backends/cuda/runtime/shims/memory_slim.h
index 3c6a58fb783..0f293d1e995 100644
--- a/backends/cuda/runtime/shims/memory_slim.h
+++ b/backends/cuda/runtime/shims/memory_slim.h
@@ -143,6 +143,20 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
 AOTI_SHIM_EXPORT AOTITorchError
 aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
 
+/**
+ * Moves a tensor into a new handle and assigns it to the output parameter.
+ *
+ * Unlike aoti_torch_new_tensor_handle, which copies, this function moves the
+ * source tensor into the destination. After this operation, the source tensor
+ * is left in an undefined (reset) state and should not be used.
+ *
+ * @param src Source tensor to move from (must not be null; will be reset)
+ * @param ret_dst Output parameter for the new tensor handle
+ * @return AOTITorchError error code (Error::Ok on success)
+ */
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst);
+
 } // extern "C"
 
 } // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index 099759d0649..852c30465af 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -77,3 +77,4 @@ def define_common_targets():
     cuda_shim_slim_cpp_unittest("aoti_torch_new_tensor_handle")
     cuda_shim_slim_cpp_unittest("aoti_torch__reinterpret_tensor")
     cuda_shim_slim_cpp_unittest("aoti_torch_copy_")
+    cuda_shim_slim_cpp_unittest("aoti_torch_assign_tensors_out")
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
new file mode 100644
index 00000000000..f01743745d2
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_assign_tensors_out_slim.cpp
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+#include <executorch/backends/aoti/slim/c10/core/Device.h>
+#include <executorch/backends/aoti/slim/c10/core/ScalarType.h>
+#include <executorch/backends/aoti/slim/core/SlimTensor.h>
+#include <executorch/backends/cuda/runtime/shims/memory_slim.h>
+#include <executorch/runtime/platform/platform.h>
+
+using namespace executorch::backends::cuda;
+using executorch::runtime::Error;
+
+namespace slim_c10 = executorch::backends::aoti::slim::c10;
+
+namespace {
+
+bool isCudaAvailable() {
+  int device_count = 0;
+  cudaError_t err = cudaGetDeviceCount(&device_count);
+  return (err == cudaSuccess && device_count > 0);
+}
+
+std::vector<int64_t> calculateContiguousStrides(
+    const std::vector<int64_t>& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  if (sizes.empty()) {
+    return strides;
+  }
+  strides[sizes.size() - 1] = 1;
+  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * sizes[i + 1];
+  }
+  return strides;
+}
+
+} // namespace
+
+class AOTITorchAssignTensorsOutSlimTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+  }
+
+  Tensor* createTestTensor(
+      const std::vector<int64_t>& sizes,
+      const std::vector<int64_t>& strides = {},
+      int32_t dtype = static_cast<int32_t>(slim_c10::ScalarType::Float),
+      int32_t device_type = static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      int32_t device_index = 0) {
+    Tensor* tensor = nullptr;
+
+    std::vector<int64_t> effective_strides = strides;
+    if (strides.empty()) {
+      effective_strides = calculateContiguousStrides(sizes);
+    }
+
+    AOTITorchError error = aoti_torch_empty_strided(
+        sizes.size(),
+        sizes.data(),
+        effective_strides.data(),
+        dtype,
+        device_type,
+        device_index,
+        &tensor);
+
+    return (error == Error::Ok) ? tensor : nullptr;
+  }
+};
+
+// ============================================================================
+// Basic Functionality Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CPU) {
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  // Store expected properties before the move
+  int64_t expected_dim = src_tensor->dim();
+  int64_t expected_size0 = src_tensor->size(0);
+  int64_t expected_size1 = src_tensor->size(1);
+  size_t expected_numel = src_tensor->numel();
+  void* expected_data_ptr = src_tensor->data_ptr();
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Verify the destination tensor received the moved properties
+  EXPECT_EQ(dst_tensor->dim(), expected_dim);
+  EXPECT_EQ(dst_tensor->size(0), expected_size0);
+  EXPECT_EQ(dst_tensor->size(1), expected_size1);
+  EXPECT_EQ(dst_tensor->numel(), expected_numel);
+  EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr);
+
+  // The source is left in a valid but unspecified (reset) state after the
+  // move; delete the handle directly rather than using it further.
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, NullSrc) {
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error = aoti_torch_assign_tensors_out(nullptr, &dst_tensor);
+
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, NullDst) {
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  AOTITorchError error = aoti_torch_assign_tensors_out(src_tensor, nullptr);
+
+  EXPECT_EQ(error, Error::InvalidArgument);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+}
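+
+// Sketch: the implementation assigns `*ret_dst = new Tensor(std::move(*src))`,
+// so the returned handle is a freshly allocated Tensor object. It should
+// therefore never alias the source handle itself, even though it takes over
+// the source's storage. This test is an illustrative addition, not part of
+// the original test plan.
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ReturnsDistinctHandle_CPU) {
+  std::vector<int64_t> sizes = {4, 5};
+  Tensor* src_tensor = createTestTensor(sizes);
+  ASSERT_NE(src_tensor, nullptr);
+  void* data_ptr = src_tensor->data_ptr();
+
+  Tensor* dst_tensor = nullptr;
+  ASSERT_EQ(aoti_torch_assign_tensors_out(src_tensor, &dst_tensor), Error::Ok);
+
+  // New handle, same underlying storage
+  EXPECT_NE(dst_tensor, src_tensor);
+  EXPECT_EQ(dst_tensor->data_ptr(), data_ptr);
+
+  delete src_tensor; // Source is moved-from
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}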
+
+// ============================================================================
+// Move Semantics Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, SourceBecomesUndefinedAfterMove_CPU) {
+  std::vector<int64_t> sizes = {3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is reset by the move - verify it no longer reports defined
+  EXPECT_FALSE(src_tensor->defined());
+
+  // Clean up: delete the moved-from source directly, then the destination
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+// ============================================================================
+// Tensor Property Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, CustomStrides_CPU) {
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {4, 1};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      strides,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  // Store expected strides before the move
+  int64_t expected_stride0 = src_tensor->stride(0);
+  int64_t expected_stride1 = src_tensor->stride(1);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Verify the destination has the expected strides
+  EXPECT_EQ(dst_tensor->stride(0), expected_stride0);
+  EXPECT_EQ(dst_tensor->stride(1), expected_stride1);
+
+  delete src_tensor; // Source is undefined after the move
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ScalarTensor_CPU) {
+  std::vector<int64_t> sizes = {};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_EQ(src_tensor->dim(), 0);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->dim(), 0);
+  EXPECT_EQ(dst_tensor->numel(), 1);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, LargeMultiDimensionalTensor_CPU) {
+  std::vector<int64_t> sizes = {10, 20, 30};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->dim(), 3);
+  EXPECT_EQ(dst_tensor->size(0), 10);
+  EXPECT_EQ(dst_tensor->size(1), 20);
+  EXPECT_EQ(dst_tensor->size(2), 30);
+  EXPECT_EQ(dst_tensor->numel(), 6000);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
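+
+// Sketch contrasting the move performed here with the copy performed by
+// aoti_torch_new_tensor_handle (per the doc comment in memory_slim.h). This
+// assumes aoti_torch_new_tensor_handle has the signature
+// (Tensor* orig, Tensor** ret_new) and leaves its source defined; adjust if
+// the actual shim differs.
+TEST_F(AOTITorchAssignTensorsOutSlimTest, MoveVsCopyContrast_CPU) {
+  std::vector<int64_t> sizes = {2, 2};
+
+  // Copy: the source stays defined and usable afterwards
+  Tensor* copy_src = createTestTensor(sizes);
+  ASSERT_NE(copy_src, nullptr);
+  Tensor* copy_dst = nullptr;
+  EXPECT_EQ(aoti_torch_new_tensor_handle(copy_src, &copy_dst), Error::Ok);
+  EXPECT_TRUE(copy_src->defined());
+
+  // Move: the source is reset and must not be used afterwards
+  Tensor* move_src = createTestTensor(sizes);
+  ASSERT_NE(move_src, nullptr);
+  Tensor* move_dst = nullptr;
+  EXPECT_EQ(aoti_torch_assign_tensors_out(move_src, &move_dst), Error::Ok);
+  EXPECT_FALSE(move_src->defined());
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(copy_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(copy_dst), Error::Ok);
+  delete move_src; // Moved-from; delete the handle directly
+  EXPECT_EQ(aoti_torch_delete_tensor_object(move_dst), Error::Ok);
+}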
+
+// ============================================================================
+// Different Dtype Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, Int64Tensor_CPU) {
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Long),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->itemsize(), 8);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BFloat16Tensor_CPU) {
+  std::vector<int64_t> sizes = {2, 3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::BFloat16),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->itemsize(), 2);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BoolTensor_CPU) {
+  std::vector<int64_t> sizes = {4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Bool),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  EXPECT_EQ(dst_tensor->itemsize(), 1);
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(src_tensor), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+// ============================================================================
+// CUDA Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, BasicFunctionality_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+  EXPECT_TRUE(src_tensor->is_cuda());
+
+  // Store the expected data pointer before the move
+  void* expected_data_ptr = src_tensor->data_ptr();
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+  EXPECT_TRUE(dst_tensor->is_cuda());
+  EXPECT_EQ(dst_tensor->data_ptr(), expected_data_ptr);
+
+  // Source is undefined after the move
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
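+
+// Sketch: chaining two moves should hand the same storage along the chain,
+// with each moved-from handle becoming undefined. This follows from the move
+// semantics exercised above; treat it as an illustrative sketch rather than
+// a documented guarantee.
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ChainedMoves_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {2, 3};
+  Tensor* first = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(first, nullptr);
+  void* original_ptr = first->data_ptr();
+
+  Tensor* second = nullptr;
+  ASSERT_EQ(aoti_torch_assign_tensors_out(first, &second), Error::Ok);
+  Tensor* third = nullptr;
+  ASSERT_EQ(aoti_torch_assign_tensors_out(second, &third), Error::Ok);
+
+  // Each move hands the same storage to the next handle
+  EXPECT_EQ(third->data_ptr(), original_ptr);
+  EXPECT_FALSE(first->defined());
+  EXPECT_FALSE(second->defined());
+
+  delete first; // Moved-from handles are deleted directly
+  delete second;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(third), Error::Ok);
+}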
+
+TEST_F(
+    AOTITorchAssignTensorsOutSlimTest,
+    SourceBecomesUndefinedAfterMove_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {3, 4};
+  Tensor* src_tensor = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(src_tensor, nullptr);
+
+  void* original_ptr = src_tensor->data_ptr();
+  ASSERT_NE(original_ptr, nullptr);
+
+  Tensor* dst_tensor = nullptr;
+  AOTITorchError error =
+      aoti_torch_assign_tensors_out(src_tensor, &dst_tensor);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(dst_tensor, nullptr);
+
+  // Destination has the original pointer
+  EXPECT_EQ(dst_tensor->data_ptr(), original_ptr);
+
+  // Source tensor is now in an undefined state
+  EXPECT_FALSE(src_tensor->defined());
+
+  delete src_tensor;
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}
+
+// ============================================================================
+// Mixed Device Tests
+// ============================================================================
+
+TEST_F(AOTITorchAssignTensorsOutSlimTest, MixedDeviceAssignments) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  std::vector<int64_t> sizes = {2, 3};
+
+  Tensor* cpu_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0);
+  ASSERT_NE(cpu_src, nullptr);
+  EXPECT_TRUE(cpu_src->is_cpu());
+
+  Tensor* cuda_src = createTestTensor(
+      sizes,
+      {},
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0);
+  ASSERT_NE(cuda_src, nullptr);
+  EXPECT_TRUE(cuda_src->is_cuda());
+
+  Tensor* cpu_dst = nullptr;
+  Tensor* cuda_dst = nullptr;
+
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cpu_src, &cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_assign_tensors_out(cuda_src, &cuda_dst), Error::Ok);
+
+  EXPECT_TRUE(cpu_dst->is_cpu());
+  EXPECT_TRUE(cuda_dst->is_cuda());
+  EXPECT_NE(cpu_dst->data_ptr(), cuda_dst->data_ptr());
+
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_src), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cpu_dst), Error::Ok);
+  EXPECT_EQ(aoti_torch_delete_tensor_object(cuda_dst), Error::Ok);
+}
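+
+// Hedged sketch for zero-element tensors: this assumes aoti_torch_empty_strided
+// accepts a zero-sized dimension, and skips otherwise. Shape metadata should
+// still move; data_ptr is deliberately not asserted since it may be null for
+// a zero-element tensor.
+TEST_F(AOTITorchAssignTensorsOutSlimTest, ZeroElementTensor_CPU) {
+  std::vector<int64_t> sizes = {0, 3};
+  Tensor* src_tensor = createTestTensor(sizes);
+  if (src_tensor == nullptr) {
+    GTEST_SKIP() << "zero-element allocation not supported";
+  }
+
+  Tensor* dst_tensor = nullptr;
+  ASSERT_EQ(aoti_torch_assign_tensors_out(src_tensor, &dst_tensor), Error::Ok);
+
+  // The shape metadata moves even though no storage is referenced
+  EXPECT_EQ(dst_tensor->dim(), 2);
+  EXPECT_EQ(dst_tensor->size(0), 0);
+  EXPECT_EQ(dst_tensor->numel(), 0);
+
+  delete src_tensor; // Moved-from; delete the handle directly
+  EXPECT_EQ(aoti_torch_delete_tensor_object(dst_tensor), Error::Ok);
+}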