diff --git a/backends/aoti/slim/core/SlimTensor.h b/backends/aoti/slim/core/SlimTensor.h index f3ab9f3fec3..c662202493d 100644 --- a/backends/aoti/slim/core/SlimTensor.h +++ b/backends/aoti/slim/core/SlimTensor.h @@ -227,6 +227,13 @@ class SlimTensor { return device().is_cpu(); } + /** + * Check if the tensor is on CUDA. + */ + bool is_cuda() const { + return device().is_cuda(); + } + /** * Check if the tensor is defined (has valid storage). */ diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index d0ee397c112..17ac00ff37a 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -22,7 +22,6 @@ def define_common_targets(): ], ) - # Header-only library for SlimTensor (CPU-only for now) runtime.cxx_library( name = "slimtensor", headers = [ @@ -37,6 +36,8 @@ def define_common_targets(): "//executorch/backends/aoti/slim/c10/core:sizes_and_strides", "//executorch/backends/aoti/slim/util:array_ref_util", "//executorch/backends/aoti/slim/util:size_util", + "//executorch/backends/aoti/slim/c10/cuda:exception", + "//executorch/backends/aoti/slim/cuda:guard", "//executorch/runtime/platform:platform", ], ) diff --git a/backends/aoti/slim/core/test/targets.bzl b/backends/aoti/slim/core/test/targets.bzl index 3a7e99dd37c..3400fd943e8 100644 --- a/backends/aoti/slim/core/test/targets.bzl +++ b/backends/aoti/slim/core/test/targets.bzl @@ -32,16 +32,17 @@ def define_common_targets(): **backend_kwargs ) - runtime.cxx_test( - name = "test_slimtensor_basic", - srcs = [ - "test_slimtensor_basic.cpp", - ], - deps = [ - "//executorch/backends/aoti/slim/core:slimtensor", - "//executorch/backends/aoti/slim/core:storage", - ], - ) + runtime.cxx_test( + name = "test_slimtensor_basic" + backend_suffix, + srcs = [ + "test_slimtensor_basic.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/core:storage", + ], + **backend_kwargs + ) runtime.cxx_test( name = 
"test_slimtensor_copy", diff --git a/backends/aoti/slim/core/test/test_slimtensor_basic.cpp b/backends/aoti/slim/core/test/test_slimtensor_basic.cpp index ee1b50898a2..adcf2495cb8 100644 --- a/backends/aoti/slim/core/test/test_slimtensor_basic.cpp +++ b/backends/aoti/slim/core/test/test_slimtensor_basic.cpp @@ -21,6 +21,9 @@ namespace executorch::backends::aoti::slim { inline std::vector get_test_devices() { std::vector devices; devices.push_back(CPU_DEVICE); +#ifdef CUDA_AVAILABLE + devices.push_back(DEFAULT_CUDA_DEVICE); +#endif return devices; } @@ -52,7 +55,9 @@ INSTANTIATE_TEST_SUITE_P( DeviceTests, SlimTensorBasicDeviceTest, ::testing::ValuesIn(get_test_devices()), - [](const ::testing::TestParamInfo& info) { return "CPU"; }); + [](const ::testing::TestParamInfo& info) { + return info.param.is_cuda() ? "CUDA" : "CPU"; + }); // ============================================================================= // Constructor Tests (Device-Parameterized) diff --git a/backends/aoti/slim/factory/Empty.h b/backends/aoti/slim/factory/Empty.h index 24b4f53a647..c0ab9d7248d 100644 --- a/backends/aoti/slim/factory/Empty.h +++ b/backends/aoti/slim/factory/Empty.h @@ -23,7 +23,7 @@ namespace executorch::backends::aoti::slim { /// @param sizes The sizes of each dimension. /// @param strides The strides of each dimension. /// @param dtype The scalar type of tensor elements. -/// @param device The target device (must be CPU). +/// @param device The target device. /// @return A new SlimTensor with allocated but uninitialized storage. inline SlimTensor empty_strided( IntArrayRef sizes, @@ -41,7 +41,7 @@ inline SlimTensor empty_strided( /// /// @param sizes The sizes of each dimension. /// @param dtype The scalar type of tensor elements. -/// @param device The target device (must be CPU). +/// @param device The target device. /// @return A new SlimTensor with contiguous strides and uninitialized storage. 
inline SlimTensor empty( IntArrayRef sizes, @@ -59,7 +59,7 @@ inline SlimTensor empty( /// /// @param sizes The sizes of each dimension as an initializer list. /// @param dtype The scalar type of tensor elements. -/// @param device The target device (must be CPU). +/// @param device The target device. /// @return A new SlimTensor with contiguous strides and uninitialized storage. inline SlimTensor empty( std::initializer_list sizes, diff --git a/backends/aoti/slim/factory/test/targets.bzl b/backends/aoti/slim/factory/test/targets.bzl index a64510b2af1..7bad3067cc0 100644 --- a/backends/aoti/slim/factory/test/targets.bzl +++ b/backends/aoti/slim/factory/test/targets.bzl @@ -1,14 +1,33 @@ +load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +def get_backend_mode(): + """Get the supported backend mode of slimtensor.""" + return ["cuda", "cpu"] + def define_common_targets(): """Define test targets for SlimTensor factory module.""" - runtime.cxx_test( - name = "test_empty", - srcs = [ - "test_empty.cpp", - ], - deps = [ - "//executorch/backends/aoti/slim/factory:empty", - ], - ) + # One test_empty target per backend mode; only the cuda one gets GPU settings + for backend_mode in get_backend_mode(): + backend_suffix = "_" + backend_mode if backend_mode == "cuda" else "" + + backend_kwargs = { + "external_deps": [("cuda", None, "cuda-lazy")], + "preprocessor_flags": ["-DCUDA_AVAILABLE=1"], + "keep_gpu_sections": True, + "remote_execution": re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + } if backend_mode == "cuda" else {} + + runtime.cxx_test( + name = "test_empty" + backend_suffix, + srcs = [ + "test_empty.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/factory:empty", + ], + **backend_kwargs + ) diff --git a/backends/aoti/slim/factory/test/test_empty.cpp b/backends/aoti/slim/factory/test/test_empty.cpp index 7d7c9cafc34..18e7ead14ef 100644 --- 
a/backends/aoti/slim/factory/test/test_empty.cpp +++ b/backends/aoti/slim/factory/test/test_empty.cpp @@ -10,6 +10,10 @@ #include +#ifdef CUDA_AVAILABLE +#include +#endif + namespace executorch::backends::aoti::slim { // ============================================================================= @@ -229,4 +233,257 @@ TEST(EmptyTest, CanWriteAndReadData) { } } +#ifdef CUDA_AVAILABLE + +// ============================================================================= +// CUDA Empty Tensor Tests +// NOTE(review): tests below never skip; they assume a CUDA device is present. +// ============================================================================= + +// ============================================================================= +// empty_strided() CUDA Tests +// ============================================================================= + +TEST(EmptyStridedCUDATest, Basic2x3Tensor) { + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.dim(), 2u); + EXPECT_EQ(tensor.numel(), 6u); + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float); + EXPECT_TRUE(tensor.is_cuda()); + EXPECT_FALSE(tensor.is_cpu()); + + auto result_sizes = tensor.sizes(); + EXPECT_EQ(result_sizes[0], 2); + EXPECT_EQ(result_sizes[1], 3); + + auto result_strides = tensor.strides(); + EXPECT_EQ(result_strides[0], 3); + EXPECT_EQ(result_strides[1], 1); +} + +TEST(EmptyStridedCUDATest, ContiguousTensor) { + std::vector sizes = {2, 3, 4}; + std::vector strides = {12, 4, 1}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_EQ(tensor.numel(), 24u); + EXPECT_EQ(tensor.nbytes(), 24 * sizeof(float)); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyStridedCUDATest, 
NonContiguousTensor) { + std::vector sizes = {3, 2}; + std::vector strides = {1, 3}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_FALSE(tensor.is_contiguous()); + EXPECT_EQ(tensor.numel(), 6u); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyStridedCUDATest, OneDimensional) { + std::vector sizes = {10}; + std::vector strides = {1}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dim(), 1u); + EXPECT_EQ(tensor.numel(), 10u); + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyStridedCUDATest, ZeroSizedTensor) { + std::vector sizes = {0, 3}; + std::vector strides = {3, 1}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.numel(), 0u); + EXPECT_TRUE(tensor.is_empty()); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyStridedCUDATest, LargeDimensionalTensor) { + std::vector sizes = {2, 3, 4, 5}; + std::vector strides = {60, 20, 5, 1}; + + SlimTensor tensor = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dim(), 4u); + EXPECT_EQ(tensor.numel(), 120u); + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_cuda()); +} + +// ============================================================================= +// empty() CUDA Tests +// ============================================================================= + +TEST(EmptyCUDATest, BasicWithArrayRef) { + std::vector sizes = {2, 3, 4}; + + SlimTensor tensor = + empty(makeArrayRef(sizes), c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.dim(), 3u); + EXPECT_EQ(tensor.numel(), 24u); + 
EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyCUDATest, VerifiesContiguousStrides) { + std::vector sizes = {2, 3, 4}; + + SlimTensor tensor = + empty(makeArrayRef(sizes), c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + auto strides = tensor.strides(); + EXPECT_EQ(strides[0], 12); + EXPECT_EQ(strides[1], 4); + EXPECT_EQ(strides[2], 1); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyCUDATest, InitializerListOverload) { + SlimTensor tensor = + empty({4, 5, 6}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dim(), 3u); + EXPECT_EQ(tensor.numel(), 120u); + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_cuda()); + + auto sizes = tensor.sizes(); + EXPECT_EQ(sizes[0], 4); + EXPECT_EQ(sizes[1], 5); + EXPECT_EQ(sizes[2], 6); +} + +TEST(EmptyCUDATest, OneDimensional) { + SlimTensor tensor = empty({10}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dim(), 1u); + EXPECT_EQ(tensor.numel(), 10u); + EXPECT_EQ(tensor.stride(0), 1); + EXPECT_TRUE(tensor.is_cuda()); +} + +TEST(EmptyCUDATest, ZeroSized) { + SlimTensor tensor = + empty({0, 5}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + EXPECT_TRUE(tensor.is_empty()); + EXPECT_EQ(tensor.numel(), 0u); + EXPECT_TRUE(tensor.is_cuda()); +} + +// ============================================================================= +// empty_like() CUDA Tests +// ============================================================================= + +TEST(EmptyLikeCUDATest, CopiesMetadata) { + std::vector sizes = {2, 3, 4}; + std::vector strides = {12, 4, 1}; + + SlimTensor original = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + SlimTensor copy = empty_like(original); + + EXPECT_EQ(copy.dim(), original.dim()); + EXPECT_EQ(copy.numel(), original.numel()); + EXPECT_EQ(copy.dtype(), original.dtype()); + EXPECT_EQ(copy.is_cuda(), original.is_cuda()); + EXPECT_EQ(copy.is_contiguous(), 
original.is_contiguous()); + + for (size_t i = 0; i < copy.dim(); i++) { + EXPECT_EQ(copy.size(i), original.size(i)); + EXPECT_EQ(copy.stride(i), original.stride(i)); + } +} + +TEST(EmptyLikeCUDATest, HasDifferentStorage) { + SlimTensor original = + empty({2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + SlimTensor copy = empty_like(original); + + EXPECT_NE(original.data_ptr(), copy.data_ptr()); + EXPECT_TRUE(copy.is_cuda()); +} + +TEST(EmptyLikeCUDATest, NonContiguousTensor) { + std::vector sizes = {3, 2}; + std::vector strides = {1, 3}; + + SlimTensor original = empty_strided( + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + SlimTensor copy = empty_like(original); + + EXPECT_FALSE(copy.is_contiguous()); + EXPECT_EQ(copy.stride(0), 1); + EXPECT_EQ(copy.stride(1), 3); + EXPECT_TRUE(copy.is_cuda()); +} + +// ============================================================================= +// CUDA Data Access Tests +// ============================================================================= + +TEST(EmptyCUDATest, DataPtrIsValid) { + SlimTensor tensor = + empty({2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + void* data = tensor.data_ptr(); + EXPECT_NE(data, nullptr); +} + +TEST(EmptyCUDATest, DeviceIndex) { + SlimTensor tensor = + empty({2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.device().index(), 0); +} + +#endif // CUDA_AVAILABLE + } // namespace executorch::backends::aoti::slim