diff --git a/backends/aoti/slim/factory/FromBlob.h b/backends/aoti/slim/factory/FromBlob.h
new file mode 100644
index 00000000000..b0c659419e9
--- /dev/null
+++ b/backends/aoti/slim/factory/FromBlob.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/aoti/slim/core/SlimTensor.h>
+#include <executorch/backends/aoti/slim/util/ArrayRefUtil.h>
+#include <executorch/backends/aoti/slim/util/SizeUtil.h>
+
+namespace executorch::backends::aoti::slim {
+
+/// Creates a SlimTensor that wraps external memory without taking ownership.
+/// The returned tensor does NOT own the underlying storage; the caller is
+/// responsible for keeping the data alive for the lifetime of the tensor.
+///
+/// @param data Pointer to external memory (must not be null).
+/// @param sizes The sizes of each dimension.
+/// @param strides The strides of each dimension.
+/// @param dtype The scalar type of tensor elements.
+/// @param device The device where the data resides.
+/// @param storage_offset Offset into storage in number of elements.
+/// @return A new SlimTensor with non-owning storage.
+inline SlimTensor from_blob(
+    void* data,
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    c10::ScalarType dtype,
+    const c10::Device& device = CPU_DEVICE,
+    int64_t storage_offset = 0) {
+  ET_CHECK_MSG(data != nullptr, "from_blob: data pointer cannot be nullptr");
+
+  size_t nbytes = compute_storage_nbytes(
+      sizes, strides, c10::elementSize(dtype), storage_offset);
+
+  Storage storage(new MaybeOwningStorage(device, data, nbytes));
+  return SlimTensor(std::move(storage), sizes, strides, dtype, storage_offset);
+}
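+
+// Illustrative usage (a sketch, not part of this header's API surface;
+// `buffer` is a hypothetical caller-owned allocation that must outlive the
+// tensor):
+//
+//   float buffer[6] = {0};
+//   SlimTensor t = from_blob(buffer, {2, 3}, {3, 1}, c10::ScalarType::Float);
+//   static_cast<float*>(t.data_ptr())[0] = 1.0f; // writes buffer[0]
+//
+// With a nonzero storage_offset, data_ptr() points storage_offset elements
+// past the base, i.e. base + storage_offset * c10::elementSize(dtype) bytes.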
+
+/// Creates a contiguous SlimTensor that wraps external memory.
+/// Computes contiguous strides automatically.
+///
+/// @param data Pointer to external memory (must not be null).
+/// @param sizes The sizes of each dimension.
+/// @param dtype The scalar type of tensor elements.
+/// @param device The device where the data resides.
+/// @param storage_offset Offset into storage in number of elements.
+/// @return A new SlimTensor with non-owning storage and contiguous strides.
+inline SlimTensor from_blob(
+    void* data,
+    IntArrayRef sizes,
+    c10::ScalarType dtype,
+    const c10::Device& device = CPU_DEVICE,
+    int64_t storage_offset = 0) {
+  std::vector<int64_t> contig_strides = compute_contiguous_strides(sizes);
+  return from_blob(
+      data, sizes, makeArrayRef(contig_strides), dtype, device, storage_offset);
+}
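+
+// For reference: contiguous strides are the running products of the trailing
+// sizes, e.g. sizes {2, 3, 4} yield strides {12, 4, 1}, so element (i, j, k)
+// sits at linear index i*12 + j*4 + k.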
+
+/// Creates a contiguous SlimTensor from an initializer list of sizes.
+///
+/// @param data Pointer to external memory (must not be null).
+/// @param sizes The sizes as an initializer list.
+/// @param dtype The scalar type of tensor elements.
+/// @param device The device where the data resides.
+/// @param storage_offset Offset into storage in number of elements.
+/// @return A new SlimTensor with non-owning storage and contiguous strides.
+inline SlimTensor from_blob(
+    void* data,
+    std::initializer_list<int64_t> sizes,
+    c10::ScalarType dtype,
+    const c10::Device& device = CPU_DEVICE,
+    int64_t storage_offset = 0) {
+  return from_blob(data, makeArrayRef(sizes), dtype, device, storage_offset);
+}
+
+/// Creates a SlimTensor from initializer lists for both sizes and strides.
+///
+/// @param data Pointer to external memory (must not be null).
+/// @param sizes The sizes as an initializer list.
+/// @param strides The strides as an initializer list.
+/// @param dtype The scalar type of tensor elements.
+/// @param device The device where the data resides.
+/// @param storage_offset Offset into storage in number of elements.
+/// @return A new SlimTensor with non-owning storage.
+inline SlimTensor from_blob(
+    void* data,
+    std::initializer_list<int64_t> sizes,
+    std::initializer_list<int64_t> strides,
+    c10::ScalarType dtype,
+    const c10::Device& device = CPU_DEVICE,
+    int64_t storage_offset = 0) {
+  return from_blob(
+      data,
+      makeArrayRef(sizes),
+      makeArrayRef(strides),
+      dtype,
+      device,
+      storage_offset);
+}
+
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/targets.bzl b/backends/aoti/slim/factory/targets.bzl
index d6dc41aa877..5b10967e166 100644
--- a/backends/aoti/slim/factory/targets.bzl
+++ b/backends/aoti/slim/factory/targets.bzl
@@ -16,3 +16,16 @@ def define_common_targets():
             "//executorch/backends/aoti/slim/util:size_util",
         ],
     )
+
+    runtime.cxx_library(
+        name = "from_blob",
+        headers = [
+            "FromBlob.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/backends/aoti/slim/core:slimtensor",
+            "//executorch/backends/aoti/slim/util:array_ref_util",
+            "//executorch/backends/aoti/slim/util:size_util",
+        ],
+    )
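+
+    # Illustrative consumer usage (a sketch, not part of this change): a
+    # client target would pull in the header via a dep such as
+    #
+    #     deps = ["//executorch/backends/aoti/slim/factory:from_blob"],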
diff --git a/backends/aoti/slim/factory/test/targets.bzl b/backends/aoti/slim/factory/test/targets.bzl
index 7bad3067cc0..668d7f75385 100644
--- a/backends/aoti/slim/factory/test/targets.bzl
+++ b/backends/aoti/slim/factory/test/targets.bzl
@@ -31,3 +31,16 @@ def define_common_targets():
         ],
         **backend_kwargs
     )
+
+    runtime.cxx_test(
+        name = "test_from_blob" + backend_suffix,
+        srcs = [
+            "test_from_blob.cpp",
+        ],
+        deps = [
+            "//executorch/backends/aoti/slim/core:storage",
+            "//executorch/backends/aoti/slim/factory:from_blob",
+            "//executorch/backends/aoti/slim/factory:empty",
+        ],
+        **backend_kwargs
+    )
diff --git a/backends/aoti/slim/factory/test/test_from_blob.cpp b/backends/aoti/slim/factory/test/test_from_blob.cpp
new file mode 100644
index 00000000000..16d43d545f3
--- /dev/null
+++ b/backends/aoti/slim/factory/test/test_from_blob.cpp
@@ -0,0 +1,782 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/aoti/slim/core/Storage.h>
+#include <executorch/backends/aoti/slim/factory/Empty.h>
+#include <executorch/backends/aoti/slim/factory/FromBlob.h>
+
+#ifdef CUDA_AVAILABLE
+#include <cuda_runtime.h>
+#endif
+
+namespace executorch::backends::aoti::slim {
+
+// =============================================================================
+// from_blob Basic Tests
+// =============================================================================
+
+TEST(FromBlobTest, BasicConstruction) {
+  constexpr size_t kNumFloats = 24;
+  float external_data[kNumFloats];
+
+  // Initialize external data
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = static_cast<float>(i) * 1.5f;
+  }
+
+  SlimTensor tensor =
+      from_blob(external_data, {2, 3, 4}, c10::ScalarType::Float);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor.numel(), kNumFloats);
+  EXPECT_EQ(tensor.dim(), 3);
+  EXPECT_EQ(tensor.size(0), 2);
+  EXPECT_EQ(tensor.size(1), 3);
+  EXPECT_EQ(tensor.size(2), 4);
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float);
+  EXPECT_TRUE(tensor.is_cpu());
+  EXPECT_TRUE(tensor.is_contiguous());
+  EXPECT_EQ(tensor.storage_offset(), 0);
+
+  // Verify data pointer points to external data
+  EXPECT_EQ(tensor.data_ptr(), static_cast<void*>(external_data));
+
+  // Verify data is accessible through tensor
+  float* data = static_cast<float*>(tensor.data_ptr());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(data[i], static_cast<float>(i) * 1.5f);
+  }
+}
+
+TEST(FromBlobTest, ModifyThroughTensor) {
+  constexpr size_t kNumFloats = 16;
+  float external_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = 0.0f;
+  }
+
+  SlimTensor tensor = from_blob(external_data, {4, 4}, c10::ScalarType::Float);
+
+  // Modify through tensor
+  float* data = static_cast<float*>(tensor.data_ptr());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data[i] = static_cast<float>(i) * 10.0f;
+  }
+
+  // Verify external data was modified
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(external_data[i], static_cast<float>(i) * 10.0f);
+  }
+}
+
+TEST(FromBlobTest, ExternalDataSurvivesTensorDestruction) {
+  constexpr size_t kNumFloats = 8;
+  float external_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = static_cast<float>(i) * 2.0f;
+  }
+
+  {
+    SlimTensor tensor =
+        from_blob(external_data, {2, 4}, c10::ScalarType::Float);
+
+    // Modify through tensor
+    float* data = static_cast<float*>(tensor.data_ptr());
+    data[0] = 999.0f;
+  }
+  // tensor is destroyed here
+
+  // External data should still be accessible
+  EXPECT_FLOAT_EQ(external_data[0], 999.0f);
+  for (size_t i = 1; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(external_data[i], static_cast<float>(i) * 2.0f);
+  }
+}
+
+// =============================================================================
+// from_blob with Strides Tests
+// =============================================================================
+
+TEST(FromBlobTest, CustomStrides) {
+  constexpr size_t kBufferSize = 16;
+  float external_data[kBufferSize];
+
+  for (size_t i = 0; i < kBufferSize; ++i) {
+    external_data[i] = static_cast<float>(i);
+  }
+
+  // Create a 2x3 tensor with custom strides (transpose-like)
+  SlimTensor tensor = from_blob(
+      external_data,
+      {2, 3},
+      {1, 4}, // Non-contiguous strides
+      c10::ScalarType::Float);
+
+  EXPECT_EQ(tensor.size(0), 2);
+  EXPECT_EQ(tensor.size(1), 3);
+  EXPECT_EQ(tensor.stride(0), 1);
+  EXPECT_EQ(tensor.stride(1), 4);
+  EXPECT_FALSE(tensor.is_contiguous());
+}
+
+TEST(FromBlobTest, WithStorageOffset) {
+  constexpr size_t kBufferSize = 20;
+  float external_data[kBufferSize];
+
+  for (size_t i = 0; i < kBufferSize; ++i) {
+    external_data[i] = static_cast<float>(i);
+  }
+
+  // Create tensor with offset of 5 elements
+  SlimTensor tensor = from_blob(
+      external_data,
+      {3, 4},
+      c10::ScalarType::Float,
+      CPU_DEVICE,
+      5); // storage_offset = 5
+
+  EXPECT_EQ(tensor.storage_offset(), 5);
+  EXPECT_EQ(tensor.numel(), 12);
+
+  // data_ptr() should point to external_data + 5 * sizeof(float)
+  EXPECT_EQ(tensor.data_ptr(), static_cast<void*>(external_data + 5));
+
+  // Verify first element is external_data[5]
+  float* data = static_cast<float*>(tensor.data_ptr());
+  EXPECT_FLOAT_EQ(data[0], 5.0f);
+}
+
+// =============================================================================
+// from_blob with Different DTypes Tests
+// =============================================================================
+
+TEST(FromBlobTest, Int64Dtype) {
+  constexpr size_t kNumElements = 6;
+  int64_t external_data[kNumElements] = {10, 20, 30, 40, 50, 60};
+
+  SlimTensor tensor = from_blob(external_data, {2, 3}, c10::ScalarType::Long);
+
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Long);
+  EXPECT_EQ(tensor.itemsize(), sizeof(int64_t));
+  EXPECT_EQ(tensor.numel(), kNumElements);
+
+  int64_t* data = static_cast<int64_t*>(tensor.data_ptr());
+  EXPECT_EQ(data[0], 10);
+  EXPECT_EQ(data[5], 60);
+}
+
+TEST(FromBlobTest, Int8Dtype) {
+  constexpr size_t kNumElements = 10;
+  int8_t external_data[kNumElements];
+
+  for (size_t i = 0; i < kNumElements; ++i) {
+    external_data[i] = static_cast<int8_t>(i);
+  }
+
+  SlimTensor tensor = from_blob(external_data, {10}, c10::ScalarType::Char);
+
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Char);
+  EXPECT_EQ(tensor.itemsize(), sizeof(int8_t));
+  EXPECT_EQ(tensor.dim(), 1);
+
+  int8_t* data = static_cast<int8_t*>(tensor.data_ptr());
+  for (size_t i = 0; i < kNumElements; ++i) {
+    EXPECT_EQ(data[i], static_cast<int8_t>(i));
+  }
+}
+
+TEST(FromBlobTest, BoolDtype) {
+  bool external_data[] = {true, false, true, false, true, true};
+
+  SlimTensor tensor = from_blob(external_data, {2, 3}, c10::ScalarType::Bool);
+
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Bool);
+  EXPECT_EQ(tensor.numel(), 6);
+
+  bool* data = static_cast<bool*>(tensor.data_ptr());
+  EXPECT_TRUE(data[0]);
+  EXPECT_FALSE(data[1]);
+  EXPECT_TRUE(data[2]);
+}
+
+// =============================================================================
+// from_blob Copy Tests
+// =============================================================================
+
+TEST(FromBlobTest, CopyToOwnedTensor) {
+  constexpr size_t kNumFloats = 12;
+  float external_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = static_cast<float>(i) * 3.0f;
+  }
+
+  SlimTensor src = from_blob(external_data, {3, 4}, c10::ScalarType::Float);
+  SlimTensor dst = empty({3, 4}, c10::ScalarType::Float);
+
+  dst.copy_(src);
+
+  // Verify dst has the data
+  float* dst_data = static_cast<float*>(dst.data_ptr());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(dst_data[i], static_cast<float>(i) * 3.0f);
+  }
+
+  // Verify dst is independent of src
+  external_data[0] = 999.0f;
+  EXPECT_FLOAT_EQ(dst_data[0], 0.0f);
+}
+
+TEST(FromBlobTest, TensorCopyToFromBlob) {
+  constexpr size_t kNumFloats = 6;
+  float src_data[kNumFloats];
+  float dst_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    src_data[i] = static_cast<float>(i) * 5.0f;
+    dst_data[i] = 0.0f;
+  }
+
+  SlimTensor src = from_blob(src_data, {2, 3}, c10::ScalarType::Float);
+  SlimTensor dst = from_blob(dst_data, {2, 3}, c10::ScalarType::Float);
+
+  dst.copy_(src);
+
+  // Verify dst_data was modified
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(dst_data[i], static_cast<float>(i) * 5.0f);
+  }
+}
+
+// =============================================================================
+// from_blob Shared Storage Tests
+// =============================================================================
+
+TEST(FromBlobTest, CopiedTensorSharesStorage) {
+  constexpr size_t kNumFloats = 8;
+  float external_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = static_cast<float>(i);
+  }
+
+  SlimTensor tensor1 =
+      from_blob(external_data, {2, 4}, c10::ScalarType::Float);
+  SlimTensor tensor2 = tensor1; // Copy constructor
+
+  // Both should point to same storage
+  EXPECT_EQ(tensor1.data_ptr(), tensor2.data_ptr());
+  EXPECT_EQ(tensor1.storage().get(), tensor2.storage().get());
+
+  // Modification through tensor2 affects tensor1
+  float* data2 = static_cast<float*>(tensor2.data_ptr());
+  data2[0] = 100.0f;
+
+  float* data1 = static_cast<float*>(tensor1.data_ptr());
+  EXPECT_FLOAT_EQ(data1[0], 100.0f);
+
+  // And external data
+  EXPECT_FLOAT_EQ(external_data[0], 100.0f);
+}
+
+// =============================================================================
+// from_blob with ArrayRef Tests
+// =============================================================================
+
+TEST(FromBlobTest, WithArrayRef) {
+  constexpr size_t kNumFloats = 6;
+  float external_data[kNumFloats];
+
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    external_data[i] = static_cast<float>(i);
+  }
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = {3, 1};
+
+  SlimTensor tensor = from_blob(
+      external_data,
+      makeArrayRef(sizes),
+      makeArrayRef(strides),
+      c10::ScalarType::Float);
+
+  EXPECT_EQ(tensor.size(0), 2);
+  EXPECT_EQ(tensor.size(1), 3);
+  EXPECT_EQ(tensor.stride(0), 3);
+  EXPECT_EQ(tensor.stride(1), 1);
+  EXPECT_TRUE(tensor.is_contiguous());
+}
+
+// =============================================================================
+// CUDA from_blob Tests
+// Tests are skipped at runtime if CUDA hardware is not available.
+// =============================================================================
+
+#ifdef CUDA_AVAILABLE
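+
+// A runtime guard of roughly this shape is assumed (sketch only; the actual
+// skip mechanism is supplied by the test harness, not by this file):
+//
+//   int device_count = 0;
+//   if (cudaGetDeviceCount(&device_count) != cudaSuccess ||
+//       device_count == 0) {
+//     GTEST_SKIP() << "No CUDA device available";
+//   }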
+
+// =============================================================================
+// from_blob CUDA Basic Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, BasicConstruction) {
+  constexpr size_t kNumFloats = 24;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Allocate CUDA memory
+  float* cuda_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  // Initialize via CPU buffer
+  float* cpu_buffer = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kNbytes));
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_buffer[i] = static_cast<float>(i) * 1.5f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  SlimTensor tensor = from_blob(
+      cuda_data, {2, 3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor.numel(), kNumFloats);
+  EXPECT_EQ(tensor.dim(), 3);
+  EXPECT_EQ(tensor.size(0), 2);
+  EXPECT_EQ(tensor.size(1), 3);
+  EXPECT_EQ(tensor.size(2), 4);
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float);
+  EXPECT_TRUE(tensor.is_cuda());
+  EXPECT_FALSE(tensor.is_cpu());
+  EXPECT_TRUE(tensor.is_contiguous());
+  EXPECT_EQ(tensor.storage_offset(), 0);
+
+  // Verify data pointer points to CUDA data
+  EXPECT_EQ(tensor.data_ptr(), static_cast<void*>(cuda_data));
+
+  // Verify data is accessible by copying back to CPU
+  float* verify_buffer = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kNbytes));
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 1.5f);
+  }
+
+  // Clean up
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_buffer);
+  DeviceTraits<c10::DeviceType::CPU>::free(verify_buffer);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+TEST(FromBlobCUDATest, ExternalDataSurvivesTensorDestruction) {
+  constexpr size_t kNumFloats = 8;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Allocate CUDA memory
+  float* cuda_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  // Initialize via CPU buffer
+  float* cpu_buffer = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kNbytes));
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_buffer[i] = static_cast<float>(i) * 2.0f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  {
+    SlimTensor tensor = from_blob(
+        cuda_data, {2, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+    // Modify first element via CPU buffer and copy back
+    cpu_buffer[0] = 999.0f;
+    DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+        cuda_data, cpu_buffer, sizeof(float), DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+  }
+  // tensor is destroyed here
+
+  // External CUDA data should still be accessible
+  float* verify_buffer = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kNbytes));
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  EXPECT_FLOAT_EQ(verify_buffer[0], 999.0f);
+  for (size_t i = 1; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 2.0f);
+  }
+
+  // Clean up
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_buffer);
+  DeviceTraits<c10::DeviceType::CPU>::free(verify_buffer);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+// =============================================================================
+// from_blob CUDA with Strides Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, CustomStrides) {
+  constexpr size_t kBufferSize = 16;
+  constexpr size_t kNbytes = kBufferSize * sizeof(float);
+
+  float* cuda_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  // Create a 2x3 tensor with custom strides (transpose-like)
+  SlimTensor tensor = from_blob(
+      cuda_data,
+      {2, 3},
+      {1, 4}, // Non-contiguous strides
+      c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
+
+  EXPECT_EQ(tensor.size(0), 2);
+  EXPECT_EQ(tensor.size(1), 3);
+  EXPECT_EQ(tensor.stride(0), 1);
+  EXPECT_EQ(tensor.stride(1), 4);
+  EXPECT_FALSE(tensor.is_contiguous());
+  EXPECT_TRUE(tensor.is_cuda());
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+TEST(FromBlobCUDATest, WithStorageOffset) {
+  constexpr size_t kBufferSize = 20;
+  constexpr size_t kNbytes = kBufferSize * sizeof(float);
+
+  float* cuda_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  // Initialize via CPU buffer
+  float* cpu_buffer = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kNbytes));
+  for (size_t i = 0; i < kBufferSize; ++i) {
+    cpu_buffer[i] = static_cast<float>(i);
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  // Create tensor with offset of 5 elements
+  SlimTensor tensor = from_blob(
+      cuda_data,
+      {3, 4},
+      c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE,
+      5); // storage_offset = 5
+
+  EXPECT_EQ(tensor.storage_offset(), 5);
+  EXPECT_EQ(tensor.numel(), 12);
+  EXPECT_TRUE(tensor.is_cuda());
+
+  // data_ptr() should point to cuda_data + 5 * sizeof(float)
+  EXPECT_EQ(tensor.data_ptr(), static_cast<void*>(cuda_data + 5));
+
+  // Verify first element is cuda_data[5] by copying back
+  float first_elem = 0.0f;
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      &first_elem,
+      cuda_data + 5,
+      sizeof(float),
+      CPU_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+  EXPECT_FLOAT_EQ(first_elem, 5.0f);
+
+  // Clean up
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_buffer);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+// =============================================================================
+// from_blob CUDA with Different DTypes Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, Int64Dtype) {
+  constexpr size_t kNumElements = 6;
+  constexpr size_t kNbytes = kNumElements * sizeof(int64_t);
+
+  int64_t* cuda_data =
+      static_cast<int64_t*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  int64_t cpu_buffer[kNumElements] = {10, 20, 30, 40, 50, 60};
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  SlimTensor tensor =
+      from_blob(cuda_data, {2, 3}, c10::ScalarType::Long, DEFAULT_CUDA_DEVICE);
+
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Long);
+  EXPECT_EQ(tensor.itemsize(), sizeof(int64_t));
+  EXPECT_EQ(tensor.numel(), kNumElements);
+  EXPECT_TRUE(tensor.is_cuda());
+
+  // Verify by copying back
+  int64_t verify_buffer[kNumElements];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  EXPECT_EQ(verify_buffer[0], 10);
+  EXPECT_EQ(verify_buffer[5], 60);
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+TEST(FromBlobCUDATest, Int8Dtype) {
+  constexpr size_t kNumElements = 10;
+  constexpr size_t kNbytes = kNumElements * sizeof(int8_t);
+
+  int8_t* cuda_data =
+      static_cast<int8_t*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  int8_t cpu_buffer[kNumElements];
+  for (size_t i = 0; i < kNumElements; ++i) {
+    cpu_buffer[i] = static_cast<int8_t>(i);
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  SlimTensor tensor =
+      from_blob(cuda_data, {10}, c10::ScalarType::Char, DEFAULT_CUDA_DEVICE);
+
+  EXPECT_EQ(tensor.dtype(), c10::ScalarType::Char);
+  EXPECT_EQ(tensor.itemsize(), sizeof(int8_t));
+  EXPECT_EQ(tensor.dim(), 1);
+  EXPECT_TRUE(tensor.is_cuda());
+
+  // Verify by copying back
+  int8_t verify_buffer[kNumElements];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumElements; ++i) {
+    EXPECT_EQ(verify_buffer[i], static_cast<int8_t>(i));
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+// =============================================================================
+// from_blob CUDA Cross-Device Copy Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, CopyCPUFromBlobToCUDAFromBlob) {
+  constexpr size_t kNumFloats = 6;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Create CPU source with from_blob
+  float cpu_src_data[kNumFloats];
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_src_data[i] = static_cast<float>(i) * 3.0f;
+  }
+  SlimTensor cpu_src =
+      from_blob(cpu_src_data, {2, 3}, c10::ScalarType::Float, CPU_DEVICE);
+
+  // Create CUDA destination with from_blob
+  float* cuda_dst_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  SlimTensor cuda_dst = from_blob(
+      cuda_dst_data, {2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Copy CPU -> CUDA
+  cuda_dst.copy_(cpu_src);
+
+  // Verify by copying back to CPU
+  float verify_buffer[kNumFloats];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 3.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_dst_data);
+}
+
+TEST(FromBlobCUDATest, CopyCUDAFromBlobToCPUFromBlob) {
+  constexpr size_t kNumFloats = 4;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Create and initialize CUDA source with from_blob
+  float* cuda_src_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  float cpu_init[kNumFloats];
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_init[i] = static_cast<float>(i) + 100.0f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+  SlimTensor cuda_src = from_blob(
+      cuda_src_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Create CPU destination with from_blob
+  float cpu_dst_data[kNumFloats] = {0.0f, 0.0f, 0.0f, 0.0f};
+  SlimTensor cpu_dst =
+      from_blob(cpu_dst_data, {2, 2}, c10::ScalarType::Float, CPU_DEVICE);
+
+  // Copy CUDA -> CPU
+  cpu_dst.copy_(cuda_src);
+
+  // Verify CPU destination data
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(cpu_dst_data[i], static_cast<float>(i) + 100.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_src_data);
+}
+
+TEST(FromBlobCUDATest, CopyCUDAFromBlobToCUDAFromBlob) {
+  constexpr size_t kNumFloats = 4;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Create and initialize CUDA source with from_blob
+  float* cuda_src_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  float cpu_init[kNumFloats];
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_init[i] = static_cast<float>(i) * 5.0f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+  SlimTensor cuda_src = from_blob(
+      cuda_src_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Create CUDA destination with from_blob
+  float* cuda_dst_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  SlimTensor cuda_dst = from_blob(
+      cuda_dst_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Copy CUDA -> CUDA
+  cuda_dst.copy_(cuda_src);
+
+  // Verify by copying back to CPU
+  float verify_buffer[kNumFloats];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 5.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_src_data);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_dst_data);
+}
+
+// =============================================================================
+// from_blob CUDA to empty() Copy Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, CopyCUDAFromBlobToOwnedCUDATensor) {
+  constexpr size_t kNumFloats = 12;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Create CUDA source with from_blob
+  float* cuda_src_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  float cpu_init[kNumFloats];
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_init[i] = static_cast<float>(i) * 7.0f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+  SlimTensor cuda_src = from_blob(
+      cuda_src_data, {3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  // Create owned CUDA destination with empty()
+  SlimTensor cuda_dst =
+      empty({3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  cuda_dst.copy_(cuda_src);
+
+  // Verify by copying back to CPU
+  float verify_buffer[kNumFloats];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer,
+      cuda_dst.data_ptr(),
+      kNbytes,
+      CPU_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 7.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_src_data);
+}
+
+TEST(FromBlobCUDATest, CopyOwnedCUDATensorToCUDAFromBlob) {
+  constexpr size_t kNumFloats = 6;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  // Create owned CUDA source with empty() and initialize via CPU
+  SlimTensor cuda_src =
+      empty({2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  float cpu_init[kNumFloats];
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    cpu_init[i] = static_cast<float>(i) * 11.0f;
+  }
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_src.data_ptr(), cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
+
+  // Create CUDA destination with from_blob
+  float* cuda_dst_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+  SlimTensor cuda_dst = from_blob(
+      cuda_dst_data, {2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+
+  cuda_dst.copy_(cuda_src);
+
+  // Verify by copying back to CPU
+  float verify_buffer[kNumFloats];
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE);
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(verify_buffer[i], static_cast<float>(i) * 11.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_dst_data);
+}
+
+// =============================================================================
+// from_blob CUDA Shared Storage Tests
+// =============================================================================
+
+TEST(FromBlobCUDATest, CopiedTensorSharesStorage) {
+  constexpr size_t kNumFloats = 8;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+
+  float* cuda_data =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kNbytes, DEFAULT_CUDA_DEVICE));
+
+  SlimTensor tensor1 = from_blob(
+      cuda_data, {2, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  SlimTensor tensor2 = tensor1; // Copy constructor
+
+  // Both should point to same storage
+  EXPECT_EQ(tensor1.data_ptr(), tensor2.data_ptr());
+  EXPECT_EQ(tensor1.storage().get(), tensor2.storage().get());
+  EXPECT_TRUE(tensor1.is_cuda());
+  EXPECT_TRUE(tensor2.is_cuda());
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_data);
+}
+
+#endif // CUDA_AVAILABLE
+
+} // namespace executorch::backends::aoti::slim