#include <cstdint>
#include <cstdlib> // For posix_memalign
#include <memory>
+ #include <unordered_map>
#include <unordered_set>
#include <vector>

- // CUDA error checking macro
- #define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \
-   do {                                      \
-     const cudaError_t err = EXPR;           \
-     if (err == cudaSuccess) {               \
-       break;                                \
-     }                                       \
-     ET_LOG(                                 \
-         Error,                              \
-         "%s:%d CUDA error: %s",             \
-         __FILE__,                           \
-         __LINE__,                           \
-         cudaGetErrorString(err));           \
-     return Error::Internal;                 \
-   } while (0)
-
- // Kernel launch check macro
- #define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \
-   ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError())
-
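For reference, the two macros deleted above wrapped CUDA runtime calls so that any failure logged file/line context and returned Error::Internal; this change drops them from this file (presumably in favor of a shared header). A hypothetical call site, where nbytes, grid, block, and my_kernel are placeholders, would have looked like:

    // Check a runtime call, launch a kernel, then surface any launch error.
    void* ptr = nullptr;
    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMalloc(&ptr, nbytes));
    my_kernel<<<grid, block>>>(static_cast<float*>(ptr));
    ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
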
namespace executorch {
namespace backends {
namespace cuda {
@@ -46,12 +27,105 @@ using executorch::aten::SizesType;
using executorch::aten::StridesType;
using executorch::backends::aoti::dtype_to_element_size;
using executorch::backends::aoti::dtype_to_scalar_type;
+ using executorch::backends::aoti::validate_storage_offset;

// Global storage for tensors and their metadata
std::unordered_set<std::shared_ptr<Tensor>> tensors;
+ // Global storage for tensor ownership information: true means the tensor
+ // owns its memory (allocated by this backend); false means it merely wraps
+ // caller-provided memory.
+ std::unordered_map<Tensor*, bool> is_tensor_own_memory;

extern "C" {

+ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+     void* data,
+     int64_t ndim,
+     const int64_t* sizes_ptr,
+     const int64_t* strides_ptr,
+     int64_t storage_offset,
+     int32_t dtype,
+     int32_t device_type,
+     int32_t device_index,
+     Tensor** ret_new_tensor,
+     int32_t layout,
+     const uint8_t* opaque_metadata,
+     int64_t opaque_metadata_size) {
+   // TODO(gasoonjia): verify given data is on the target device
+   (void)device_type;
+   (void)opaque_metadata;
+   (void)layout;
+   (void)opaque_metadata_size;
+
+   // Validate input parameters first
+   if (data == nullptr) {
+     ET_LOG(
+         Error,
+         "aoti_torch_create_tensor_from_blob_v2 failed: data pointer is null");
+     return Error::InvalidArgument;
+   }
+
+   if (sizes_ptr == nullptr && ndim > 0) {
+     ET_LOG(
+         Error,
+         "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null");
+     return Error::InvalidArgument;
+   }
+
+   if (ret_new_tensor == nullptr) {
+     ET_LOG(
+         Error,
+         "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null");
+     return Error::InvalidArgument;
+   }
+
+   // Check that device_index is always 0
+   if (device_index != 0) {
+     ET_LOG(Error, "device_index must be 0, got: %d", device_index);
+     return Error::InvalidArgument;
+   }
+
+   // Validate dtype using SupportedDTypes from utils.h
+   AOTITorchError dtype_error = validate_dtype(dtype);
+   if (dtype_error != Error::Ok) {
+     return dtype_error;
+   }
+
+   // Storage offset must be 0 since from_blob cannot handle different offsets
+   AOTITorchError storage_offset_error = validate_storage_offset(storage_offset);
+   if (storage_offset_error != Error::Ok) {
+     return storage_offset_error;
+   }
+
+   // Convert sizes to the format expected by ExecuTorch using SizesType
+   std::vector<executorch::aten::SizesType> sizes =
+       convert_sizes_to_vector(ndim, sizes_ptr);
+
+   // Convert strides using the common helper function with StridesType
+   std::vector<executorch::aten::StridesType> strides =
+       convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+   // Create ExecuTorch tensor that wraps the existing memory
+   // Note: we're NOT copying the data, just wrapping it
+   auto tensor = executorch::extension::from_blob(
+       data, // existing memory (don't copy!)
+       sizes, // tensor dimensions
+       strides, // tensor strides (allows different strides)
+       dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType
+   );
+
+   if (!tensor) {
+     ET_LOG(Error, "Failed to create tensor from blob");
+     return Error::InvalidArgument;
+   }
+
+   // Store the tensor so it doesn't get destroyed
+   tensors.insert(tensor);
+
+   *ret_new_tensor = tensor.get();
+   is_tensor_own_memory[tensor.get()] = false;
+
+   return Error::Ok;
+ }
+
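A minimal usage sketch of the new shim (not part of the diff): it wraps caller-owned memory without copying, so deleting the handle must leave the buffer alone. The dtype code 6 (float32) and device_type 1 (CUDA) follow the usual PyTorch/AOTI encoding and are assumptions here; device_type is currently unchecked per the TODO above.

    float buffer[6] = {0.f};     // caller-owned storage
    int64_t sizes[2] = {2, 3};
    int64_t strides[2] = {3, 1}; // contiguous row-major strides
    Tensor* t = nullptr;
    AOTITorchError err = aoti_torch_create_tensor_from_blob_v2(
        buffer, /*ndim=*/2, sizes, strides, /*storage_offset=*/0,
        /*dtype=*/6, /*device_type=*/1, /*device_index=*/0, &t,
        /*layout=*/0, /*opaque_metadata=*/nullptr, /*opaque_metadata_size=*/0);
    if (err == Error::Ok) {
      // is_tensor_own_memory[t] == false: this only drops tracking and
      // leaves `buffer` for the caller to manage.
      aoti_torch_delete_tensor_object(t);
    }
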
AOTITorchError aoti_torch_empty_strided(
    int64_t ndim,
    const int64_t* sizes_ptr,
@@ -119,6 +193,7 @@ AOTITorchError aoti_torch_empty_strided(
  // Store the tensor so it doesn't get destroyed
  tensors.insert(tensor);
  *ret_new_tensor = tensor.get();
+   is_tensor_own_memory[tensor.get()] = true;

  return Error::Ok;
}
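By contrast, a tensor from aoti_torch_empty_strided is recorded as owning its storage, so deleting it should release the allocation as well. A hedged sketch, assuming the trailing parameters mirror the standard AOTI shim signature (dtype, device_type, device_index, out-pointer) and that a null strides_ptr means contiguous:

    int64_t sizes[2] = {4, 4};
    Tensor* t = nullptr;
    AOTITorchError err = aoti_torch_empty_strided(
        /*ndim=*/2, sizes, /*strides_ptr=*/nullptr,
        /*dtype=*/6, /*device_type=*/1, /*device_index=*/0, &t);
    if (err == Error::Ok) {
      // is_tensor_own_memory[t] == true: delete frees the backing memory too.
      aoti_torch_delete_tensor_object(t);
    }
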
@@ -156,9 +231,32 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
  // If tensor not found in our tracking, it's invalid
  if (!found_in_tensors) {
    ET_LOG(Error, "Didn't find tensor %p", tensor);
+     // Clean up any stale ownership info
+     is_tensor_own_memory.erase(tensor);
    return Error::InvalidArgument;
  }

+   // Check ownership before cleaning up metadata
+   auto ownership_it = is_tensor_own_memory.find(tensor);
+   bool owns_memory = (ownership_it != is_tensor_own_memory.end())
+       ? ownership_it->second
+       : false;
+
+   // Clean up local metadata maps immediately to prevent use-after-free
+   is_tensor_own_memory.erase(tensor);
+
+   if (!owns_memory) {
+     // Don't free memory since the tensor doesn't own it, but still remove
+     // it from tracking
+     for (auto it = tensors.begin(); it != tensors.end(); ++it) {
+       if (it->get() == tensor) {
+         tensors.erase(it);
+         break;
+       }
+     }
+     return Error::Ok;
+   }
+
  // Find and delete the tensor
  for (auto it = tensors.begin(); it != tensors.end(); ++it) {
    if (it->get() == tensor) {