diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index 024418d31a6..ad5baa8d83f 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -109,6 +109,36 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "runtime_shims_slim",
+    srcs = [
+        "shims/memory_slim.cpp",
+    ],
+    headers = [
+        "shims/memory_slim.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["@EXECUTORCH_CLIENTS"],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+    deps = [
+        "//executorch/backends/aoti/slim/core:slimtensor",
+        "//executorch/backends/aoti/slim/factory:empty",
+        "//executorch/backends/aoti/slim/factory:from_blob",
+        "//executorch/backends/aoti:common_shims",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/platform:platform",
+    ],
+    nvcc_flags = get_nvcc_arch_args() + [
+        "-_NVCC_HOST_COMPILER_FLAG_",
+        "gcc",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+)
+
 runtime.cxx_library(
     name = "cuda_backend",
     srcs = [
diff --git a/backends/cuda/runtime/shims/memory_slim.cpp b/backends/cuda/runtime/shims/memory_slim.cpp
new file mode 100644
index 00000000000..500cd41308e
--- /dev/null
+++ b/backends/cuda/runtime/shims/memory_slim.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/shims/memory_slim.h>
+
+#include
+#include
+#include
+#include
+
+namespace executorch::backends::cuda {
+
+namespace c10 = executorch::backends::aoti::slim::c10;
+using c10::Device;
+using c10::DeviceIndex;
+using c10::DeviceType;
+using c10::ScalarType;
+using executorch::backends::aoti::slim::empty_strided;
+using executorch::backends::aoti::slim::from_blob;
+using executorch::backends::aoti::slim::IntArrayRef;
+
+extern "C" {
+
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    Tensor** ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size) {
+  // Unused parameters
+  (void)layout;
+  (void)opaque_metadata;
+  (void)opaque_metadata_size;
+
+  ET_CHECK_OR_RETURN_ERROR(
+      data != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2: data is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_new_tensor != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      !(sizes_ptr == nullptr && ndim > 0),
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0");
+
+  IntArrayRef sizes(sizes_ptr, static_cast<size_t>(ndim));
+  IntArrayRef strides(strides_ptr, static_cast<size_t>(ndim));
+
+  // Create the SlimTensor using from_blob (non-owning)
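+  // Note: from_blob does not take ownership of `data`; deleting the returned
+  // Tensor must leave the caller's buffer untouched. storage_offset is given
+  // in elements, so the resulting data_ptr() is data + storage_offset *
+  // itemsize (both behaviors are covered by the _slim unit tests below).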
+  *ret_new_tensor = new Tensor(from_blob(
+      data,
+      sizes,
+      strides,
+      static_cast<ScalarType>(dtype),
+      Device(
+          static_cast<DeviceType>(device_type),
+          static_cast<DeviceIndex>(device_index)),
+      storage_offset));
+
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory_slim.h b/backends/cuda/runtime/shims/memory_slim.h
new file mode 100644
index 00000000000..7650c4de4b6
--- /dev/null
+++ b/backends/cuda/runtime/shims/memory_slim.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace executorch::backends::cuda {
+
+using executorch::runtime::Error;
+using AOTITorchError = Error;
+using Tensor = executorch::backends::aoti::slim::SlimTensor;
+
+extern "C" {
+
+/**
+ * Creates a tensor object from an existing memory blob without copying the
+ * data. The tensor will wrap the provided memory and will not take ownership
+ * of it. When the tensor is deleted, the original memory will remain valid
+ * and must be freed by the caller.
+ *
+ * @param data Pointer to the memory blob to wrap (must not be null)
+ * @param ndim Number of dimensions in the tensor
+ * @param sizes_ptr Pointer to array of dimension sizes
+ * @param strides_ptr Pointer to array of strides for each dimension
+ * @param storage_offset Storage offset in number of elements
+ * @param dtype Data type identifier (matches PyTorch scalar types)
+ * @param device_type Device type (CPU=0, CUDA=1)
+ * @param device_index Device index
+ * @param ret_new_tensor Output parameter for the created tensor
+ * @param layout Tensor layout identifier (0=strided)
+ * @param opaque_metadata Optional metadata pointer (can be null)
+ * @param opaque_metadata_size Size of opaque metadata in bytes
+ * @return AOTITorchError error code (Error::Ok on success)
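+ *
+ * Example (illustrative sketch; `buf`, `sizes`, and `strides` stand in for a
+ * caller-owned device buffer and its shape metadata):
+ * @code
+ *   Tensor* t = nullptr;
+ *   AOTITorchError err = aoti_torch_create_tensor_from_blob_v2(
+ *       buf, 2, sizes, strides,
+ *       0,                                        // storage_offset
+ *       static_cast<int32_t>(ScalarType::Float),  // dtype
+ *       1,                                        // device_type (CUDA)
+ *       0,                                        // device_index
+ *       &t,
+ *       0,                                        // layout (strided)
+ *       nullptr, 0);                              // opaque_metadata
+ *   // ... use *t ...
+ *   delete t; // destroys the tensor object only; `buf` must still be freed
+ *             // by the caller
+ * @endcode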
+ */
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    Tensor** ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size);
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index 256b375f20d..78f8dea20ce 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -25,12 +25,40 @@ def cuda_shim_cpp_unittest(name):
         ),
     )
 
+def cuda_shim_slim_cpp_unittest(name):
+    """Unittest for SlimTensor-based shim functions."""
+    cpp_unittest(
+        name = "test_" + name + "_slim",
+        srcs = [
+            "test_" + name + "_slim.cpp",
+        ],
+        deps = [
+            "//executorch/backends/cuda/runtime:runtime_shims_slim",
+            "//executorch/backends/aoti:common_shims",
+            "//executorch/backends/aoti/slim/core:slimtensor",
+            "//executorch/backends/aoti/slim/factory:empty",
+            "//executorch/backends/aoti/slim/factory:from_blob",
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/platform:platform",
+        ],
+
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+        preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+        keep_gpu_sections = True,
+        remote_execution = re_test_utils.remote_execution(
+            platform = "gpu-remote-execution",
+        ),
+    )
+
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
 
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
+    # Original ETensor-based shim tests; to be removed after the migration
     cuda_shim_cpp_unittest("aoti_torch_empty_strided")
     cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object")
     cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2")
@@ -41,3 +69,6 @@ def define_common_targets():
     cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
     cuda_shim_cpp_unittest("aoti_torch_item_bool")
     cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
+
+    # SlimTensor-based shim tests
+    cuda_shim_slim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2")
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp
new file mode 100644
index 00000000000..21f8c79cc46
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+using namespace executorch::backends::cuda;
+using executorch::runtime::Error;
+
+namespace slim_c10 = executorch::backends::aoti::slim::c10;
+
+namespace {
+
+// Helper to check if CUDA is available
+bool isCudaAvailable() {
+  int device_count = 0;
+  cudaError_t err = cudaGetDeviceCount(&device_count);
+  return (err == cudaSuccess && device_count > 0);
+}
+
+// Helper to calculate contiguous strides from sizes
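+// (e.g. sizes {2, 3, 4} yield contiguous strides {12, 4, 1})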
+std::vector<int64_t> calculateContiguousStrides(
+    const std::vector<int64_t>& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  if (sizes.empty()) {
+    return strides;
+  }
+  strides[sizes.size() - 1] = 1;
+  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * sizes[i + 1];
+  }
+  return strides;
+}
+
+// Helper to calculate numel from sizes
+int64_t calculateNumel(const std::vector<int64_t>& sizes) {
+  int64_t numel = 1;
+  for (int64_t size : sizes) {
+    numel *= size;
+  }
+  return numel;
+}
+
+} // namespace
+
+// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2
+// tests
+class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+  }
+
+  void TearDown() override {
+    // Clean up tensors
+    for (Tensor* t : tensors_) {
+      delete t;
+    }
+    tensors_.clear();
+
+    // Clean up CUDA memory
+    for (void* ptr : cuda_memory_) {
+      if (ptr != nullptr) {
+        cudaFree(ptr);
+      }
+    }
+    cuda_memory_.clear();
+
+    // Clean up CPU memory
+    for (void* ptr : cpu_memory_) {
+      if (ptr != nullptr) {
+        free(ptr);
+      }
+    }
+    cpu_memory_.clear();
+  }
+
+  void* allocateCudaMemory(size_t bytes) {
+    void* ptr = nullptr;
+    cudaError_t err = cudaMalloc(&ptr, bytes);
+    if (err == cudaSuccess && ptr != nullptr) {
+      cuda_memory_.push_back(ptr);
+    }
+    return ptr;
+  }
+
+  void* allocateCpuMemory(size_t bytes) {
+    void* ptr = nullptr;
+    int result = posix_memalign(&ptr, 16, bytes);
+    if (result == 0 && ptr != nullptr) {
+      cpu_memory_.push_back(ptr);
+    }
+    return ptr;
+  }
+
+  void trackTensor(Tensor* t) {
+    if (t != nullptr) {
+      tensors_.push_back(t);
+    }
+  }
+
+ private:
+  std::vector<Tensor*> tensors_;
+  std::vector<void*> cuda_memory_;
+  std::vector<void*> cpu_memory_;
+};
+
+// ============================================================================
+// Common test body - parameterized by device type
+// ============================================================================
+
+void runBasicFromBlobTest(
+    AOTITorchCreateTensorFromBlobV2SlimTest* fixture,
+    void* data,
+    int32_t device_type,
+    int32_t device_index) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      device_type,
+      device_index,
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  // Check tensor properties
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 3);
+  EXPECT_EQ(tensor->numel(), 6);
+  EXPECT_EQ(
+      static_cast<int32_t>(tensor->dtype()),
+      static_cast<int32_t>(slim_c10::ScalarType::Float));
+
+  // Verify the tensor uses the same data pointer (non-owning)
+  EXPECT_EQ(tensor->data_ptr(), data);
+
+  // Cleanup - tensor should NOT free the original memory
+  delete tensor;
+}
+
+void runScalarFromBlobTest(
+    AOTITorchCreateTensorFromBlobV2SlimTest* fixture,
+    void* data,
+    int32_t device_type,
+    int32_t device_index) {
+  std::vector<int64_t> sizes = {}; // 0D tensor
+  std::vector<int64_t> strides = {};
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      device_type,
+      device_index,
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->data_ptr(), data);
+
+  delete tensor;
+}
+
+void runMultiDimensionalFromBlobTest(
+    AOTITorchCreateTensorFromBlobV2SlimTest* fixture,
+    void* data,
+    int32_t device_type,
+    int32_t device_index) {
+  std::vector<int64_t> sizes = {2, 3, 4};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      device_type,
+      device_index,
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_EQ(tensor->dim(), 3);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 3);
+  EXPECT_EQ(tensor->size(2), 4);
+  EXPECT_EQ(tensor->numel(), 24);
+  EXPECT_EQ(tensor->data_ptr(), data);
+
+  delete tensor;
+}
+
+void runCustomStridesFromBlobTest(
+    AOTITorchCreateTensorFromBlobV2SlimTest* fixture,
+    void* data,
+    int32_t device_type,
+    int32_t device_index) {
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {1, 3}; // Column-major
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      device_type,
+      device_index,
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_EQ(tensor->stride(0), 1);
+  EXPECT_EQ(tensor->stride(1), 3);
+  EXPECT_FALSE(tensor->is_contiguous());
+  EXPECT_EQ(tensor->data_ptr(), data);
+
+  delete tensor;
+}
+
+void runStorageOffsetFromBlobTest(
+    AOTITorchCreateTensorFromBlobV2SlimTest* fixture,
+    void* data,
+    int32_t device_type,
+    int32_t device_index) {
+  std::vector<int64_t> sizes = {2, 2};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      2, // storage_offset = 2 elements
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      device_type,
+      device_index,
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_EQ(tensor->storage_offset(), 2);
+  // data_ptr should point to base + offset * itemsize
+  char* expected_ptr = static_cast<char*>(data) + 2 * sizeof(float);
+  EXPECT_EQ(tensor->data_ptr(), expected_ptr);
+
+  delete tensor;
+}
+
+// ============================================================================
+// CPU Tests
+// ============================================================================
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) {
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runBasicFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) {
+  size_t bytes = sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runScalarFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) {
+  size_t bytes = 24 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runMultiDimensionalFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) {
+  size_t bytes = 12 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runCustomStridesFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) {
+  // Allocate extra space for offset
+  size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runStorageOffsetFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CPU), 0);
+}
+
+// ============================================================================
+// CUDA Tests
+// ============================================================================
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runBasicFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runScalarFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 24 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runMultiDimensionalFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 12 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runCustomStridesFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  // Allocate extra space for offset
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  runStorageOffsetFromBlobTest(
+      this, data, static_cast<int32_t>(slim_c10::DeviceType::CUDA), 0);
+}
+
+// ============================================================================
+// Verify Non-Owning Behavior
+// ============================================================================
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) {
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  // Write a pattern
+  float* float_data = static_cast<float*>(data);
+  float_data[0] = 42.0f;
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  // Delete tensor - memory should NOT be freed
+  delete tensor;
+  tensor = nullptr;
+
+  // Memory should still be accessible
+  EXPECT_FLOAT_EQ(float_data[0], 42.0f);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CUDA) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  // Write a pattern
+  float pattern = 42.0f;
+  cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  // Delete tensor - memory should NOT be freed
+  delete tensor;
+  tensor = nullptr;
+
+  // Memory should still be accessible
+  float readback = 0.0f;
+  cudaError_t cuda_err =
+      cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost);
+  EXPECT_EQ(cuda_err, cudaSuccess);
+  EXPECT_FLOAT_EQ(readback, 42.0f);
+}
+
+// ============================================================================
+// Error Cases
+// ============================================================================
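+// These cases exercise the ET_CHECK_OR_RETURN_ERROR guards in the shim: a
+// null data pointer or a null return pointer is rejected with
+// Error::InvalidArgument.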
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      nullptr, // null data
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) {
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0,
+      nullptr, // null return pointer
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+// ============================================================================
+// Verify Device Properties
+// ============================================================================
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) {
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_TRUE(tensor->is_cpu());
+  EXPECT_FALSE(tensor->is_cuda());
+  EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU);
+
+  delete tensor;
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_FALSE(tensor->is_cpu());
+  EXPECT_TRUE(tensor->is_cuda());
+  EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA);
+
+  delete tensor;
+}