
Commit da85148

Gasoonjia authored and facebook-github-bot committed
tensor empty strided (#14549)
Summary: This diff introduces aoti_torch_empty_strided to the ExecuTorch CUDA backend; it will be one of the main functions for creating an empty tensor with a given set of strides.

Differential Revision: D83094606
1 parent 83daccf commit da85148
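
For orientation, here is a minimal caller sketch (not part of this diff) showing how the new shim might be invoked. The wrapper function and the dtype code are illustrative assumptions: the code 6 follows PyTorch's ScalarType numbering for float32, and device_type 1 selects the CUDA path in the implementation below.

// Hypothetical caller sketch -- not from this diff.
#include <executorch/backends/cuda/runtime/shims/memory.h>

using namespace executorch::backends::cuda;

bool make_empty_cuda_tensor(Tensor** out) {
  const int64_t sizes[] = {2, 3};
  // Passing nullptr for strides_ptr asks the shim to derive contiguous
  // strides ({3, 1} here) from the sizes.
  AOTITorchError err = aoti_torch_empty_strided(
      /*ndim=*/2,
      sizes,
      /*strides_ptr=*/nullptr,
      /*dtype=*/6, // float32, assuming ScalarType numbering
      /*device_type=*/1, // 1 == CUDA, 0 == CPU in this shim
      /*device_index=*/0,
      out);
  return err == executorch::runtime::Error::Ok;
}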

File tree

7 files changed: +709 −0 lines changed

backends/cuda/runtime/TARGETS

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

oncall("executorch")

runtime.cxx_library(
    name = "runtime_shims",
    srcs = [
        "shims/memory.cpp",
        "shims/tensor_attribute.cpp",
    ],
    headers = [
        "shims/memory.h",
        "shims/tensor_attribute.h",
        "shims/utils.h",
    ],
    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
    link_whole = True,
    supports_python_dlopen = True,
    # Constructor needed for backend registration.
    compiler_flags = ["-Wno-global-constructors"],
    visibility = ["@EXECUTORCH_CLIENTS"],
    deps = [
        "//executorch/backends/aoti:common_shims",
        "//executorch/extension/tensor:tensor",
        "//executorch/runtime/core:core",
        "//executorch/runtime/core/exec_aten:lib",
        "//executorch/runtime/platform:platform",
    ],
    external_deps = [
        ("cuda", None, "cuda-lazy"),
    ],
)
backends/cuda/runtime/shims/memory.cpp

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/aoti/common_shims.h>
#include <executorch/backends/aoti/utils.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
#include <executorch/backends/cuda/runtime/shims/utils.h>
#include <executorch/runtime/platform/log.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib> // For posix_memalign
#include <cstring>
#include <memory>
#include <unordered_set>
#include <vector>

namespace executorch {
namespace backends {
namespace cuda {

// Global storage for tensors and their metadata
std::unordered_set<std::shared_ptr<Tensor>> tensors;

extern "C" {

AOTITorchError aoti_torch_empty_strided(
    int64_t ndim,
    const int64_t* sizes_ptr,
    const int64_t* strides_ptr,
    int32_t dtype,
    int32_t device_type,
    int32_t device_index,
    Tensor** ret_new_tensor) {
  // This requires us to reserve CUDA memory and put it into an ETensor
  void* ptr;
  int64_t numel = 1;
  for (int i = 0; i < ndim; i++) {
    numel *= sizes_ptr[i];
  }

  AOTITorchError dtype_error = validate_dtype(dtype);
  if (dtype_error != Error::Ok) {
    return dtype_error;
  }

  size_t element_size = dtype_to_element_size(dtype);
  if (element_size == 0) {
    ET_LOG(Error, "Invalid element size for dtype: %d", dtype);
    return Error::InvalidArgument;
  }
  int64_t nbytes = numel * element_size;

  if (device_type == 1) { // cuda
    cudaError_t err = cudaMalloc(&ptr, nbytes);
    if (err != cudaSuccess) {
      ET_LOG(
          Error,
          "failed to allocate %ld bytes: %s",
          nbytes,
          cudaGetErrorString(err));
      return Error::MemoryAllocationFailed;
    }
  } else if (device_type == 0) { // cpu
    // Ensure 16-byte alignment for CPU memory to match CUDA requirements.
    // TODO: do we need to do this in the CUDA backend?
    int result = posix_memalign(&ptr, 16, nbytes);
    if (result != 0) {
      ET_LOG(Error, "Failed to allocate aligned CPU memory");
      return Error::MemoryAllocationFailed;
    }
    if (ptr == nullptr) {
      ET_LOG(Error, "Failed to call posix_memalign");
      return Error::MemoryAllocationFailed;
    }
  } else {
    ET_LOG(
        Error,
        "Need to implement empty_strided for non-CUDA, non-CPU device type %d",
        device_type);
    return Error::NotImplemented;
  }

  // ETensor sizes
  std::vector<int32_t> sizes(ndim);
  for (int i = 0; i < ndim; i++) {
    sizes[i] = sizes_ptr[i];
  }

  // ETensor strides
  std::vector<int32_t> strides(ndim);
  if (strides_ptr != nullptr) {
    // Use the provided strides. It is OK if they are not contiguous
    // strides, since they are only used internally in the CUDA delegate.
    for (int i = 0; i < ndim; i++) {
      strides[i] = strides_ptr[i];
    }
  } else {
    // Calculate contiguous strides from sizes using ExecuTorch's algorithm
    if (ndim > 0) {
      strides[ndim - 1] = 1; // Last dimension has stride 1
      for (int i = ndim - 2; i >= 0; i--) {
        if (sizes_ptr[i + 1] == 0) {
          strides[i] = strides[i + 1]; // Copy stride when size is 0
        } else {
          strides[i] = strides[i + 1] * sizes_ptr[i + 1];
        }
      }
    }
  }

  // ETensor creation with dynamic shape support for edge cases
  auto tensor = executorch::extension::from_blob(
      ptr, sizes, strides, dtype_to_scalar_type(dtype));

  // Store the tensor so it doesn't get destroyed
  tensors.insert(tensor);
  *ret_new_tensor = tensor.get();

  return Error::Ok;
}

// TODO(gasoonjia): reuse aoti_torch_delete_tensor_object to destroy tensors
void clear_all_tensors() {
  tensors.clear();
}

} // extern "C"

} // namespace cuda
} // namespace backends
} // namespace executorch
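
As an aside on the stride fallback in aoti_torch_empty_strided above, a standalone restatement (illustration only, not part of the diff) makes the computation and its zero-size edge case easier to see: for sizes {2, 3, 4} it yields strides {12, 4, 1}.

#include <cstdint>
#include <vector>

// Mirror of the shim's fallback stride computation, for illustration.
std::vector<int32_t> contiguous_strides(const std::vector<int64_t>& sizes) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  std::vector<int32_t> strides(ndim);
  if (ndim == 0) {
    return strides;
  }
  strides[ndim - 1] = 1; // innermost dimension is densely packed
  for (int64_t i = ndim - 2; i >= 0; i--) {
    // A zero-sized dimension contributes no elements, so the previous
    // stride is carried over instead of multiplied through.
    strides[i] = (sizes[i + 1] == 0)
        ? strides[i + 1]
        : static_cast<int32_t>(strides[i + 1] * sizes[i + 1]);
  }
  return strides;
}

// e.g. contiguous_strides({2, 3, 4}) returns {12, 4, 1}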
backends/cuda/runtime/shims/memory.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cuda_runtime.h>
#include <executorch/backends/aoti/common_shims.h>
#include <cstdint>

namespace executorch {
namespace backends {
namespace cuda {

using namespace executorch::backends::aoti;

extern "C" {

AOTITorchError aoti_torch_empty_strided(
    int64_t ndim,
    const int64_t* sizes_ptr,
    const int64_t* strides_ptr,
    int32_t dtype,
    int32_t device_type,
    int32_t device_index,
    Tensor** ret_new_tensor);

// Function to clear all tensors from internal storage
// TODO(gasoonjia): reuse aoti_torch_delete_tensor_object to destroy tensors
void clear_all_tensors();

} // extern "C"

} // namespace cuda
} // namespace backends
} // namespace executorch
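
Read together, the two declarations suggest a simple lifecycle: tensors created through the shim are kept alive by the global set until clear_all_tensors() drops them. A hedged sketch of that flow (hypothetical calling code, not from the diff):

#include <executorch/backends/cuda/runtime/shims/memory.h>

void run_delegate_once() { // hypothetical driver
  using namespace executorch::backends::cuda;
  Tensor* t = nullptr;
  const int64_t sizes[] = {4};
  if (aoti_torch_empty_strided(
          /*ndim=*/1, sizes, /*strides_ptr=*/nullptr,
          /*dtype=*/6, /*device_type=*/1, /*device_index=*/0, &t) !=
      executorch::runtime::Error::Ok) {
    return;
  }
  // ... t is valid here; the shim's global set keeps it alive ...
  // Drops the shared_ptr bookkeeping. Since from_blob does not take
  // ownership of the underlying buffer, freeing the device memory is a
  // separate concern (hence the aoti_torch_delete_tensor_object TODO).
  clear_all_tensors();
}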
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
targets.bzl

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")

def cuda_shim_cpp_unittest(name):
    cpp_unittest(
        name = "test_" + name,
        srcs = [
            "test_" + name + ".cpp",
        ],
        deps = [
            "//executorch/backends/aoti:common_shims",
            "//executorch/backends/cuda/runtime:runtime_shims",
            "//executorch/extension/tensor:tensor",
            "//executorch/runtime/core:core",
            "//executorch/runtime/platform:platform",
            "//executorch/runtime/core/exec_aten:lib",
        ],
        external_deps = [
            ("cuda", None, "cuda-lazy"),
        ],
    )

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """
    cuda_shim_cpp_unittest("aoti_torch_empty_strided")
