Update on "gemma3 e2e runner on cuda"

Gasoonjia · Gasoonjia · commit 91d322e756cd · 2025-10-20T17:14:31.000-07:00
This diff introduces an end-to-end (e2e) runner for the gemma3 model on CUDA, delegating to the AOTI library; the runner is guarded by CI. It also includes the other infrastructure updates needed to build and run the `gemma3 e2e runner` on CUDA devices. Differential Revision: [D85087532](https://our.internmc.facebook.com/intern/diff/D85087532/) [ghstack-poisoned]
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -34,6 +34,40 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support non-contiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Pass the platform's export-dynamic linker flag so symbols are exported
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against the ExecuTorch core library and the platform's dl libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
@@ -62,9 +96,9 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -27,6 +27,25 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "tensor_maker",
+    srcs = [
+        "tensor/tensor_maker.cpp",
+    ],
+    headers = [
+        "tensor/tensor_maker.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["@EXECUTORCH_CLIENTS"],
+    deps = [
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/core/exec_aten:lib",
+        "//executorch/runtime/core/exec_aten/util:tensor_util",
+    ],
+)
+
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
@@ -52,8 +71,8 @@ runtime.cxx_library(
     compiler_flags = ["-Wno-global-constructors"],
     visibility = ["@EXECUTORCH_CLIENTS"],
     deps = [
+        ":tensor_maker",
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
         "//executorch/runtime/core:core",
         "//executorch/runtime/core/exec_aten:lib",
         "//executorch/runtime/platform:platform",
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
@@ -11,6 +11,7 @@
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
 #include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
@@ -163,9 +164,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
 
   // Create ExecutorTorch tensor that wraps the existing memory
   // Note: We're NOT copying the data, just wrapping it
-  auto tensor = executorch::extension::from_blob(
-      data, // existing memory (don't copy!)
+  // Using CUDA-specific tensor maker that supports non-contiguous tensors
+  auto tensor = executorch::backends::cuda::make_tensor(
       sizes, // tensor dimensions
+      data, // existing memory (don't copy!)
+      {}, // dim_order (empty, will be auto-generated)
       strides, // tensor strides (allows different strides)
       dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType
   );
@@ -268,8 +271,13 @@ AOTITorchError aoti_torch_empty_strided(
   auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
 
   // ETensor creation with dynamic shape support for edge cases
-  auto tensor = executorch::extension::from_blob(
-      ptr, sizes, strides, dtype_to_scalar_type(dtype));
+  // Using CUDA-specific tensor maker that supports non-contiguous tensors
+  auto tensor = executorch::backends::cuda::make_tensor(
+      sizes,
+      ptr,
+      {}, // dim_order (empty, will be auto-generated)
+      strides,
+      dtype_to_scalar_type(dtype));
 
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
@@ -647,9 +655,11 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
   // Create new tensor view that reinterprets the same memory with different
   // shape/strides This creates a view, not a copy - the data pointer is shared
-  std::shared_ptr<Tensor> tensor = executorch::extension::from_blob(
-      data_ptr, // Reuse the same memory from source tensor
+  // Using CUDA-specific tensor maker that supports non-contiguous tensors
+  std::shared_ptr<Tensor> tensor = executorch::backends::cuda::make_tensor(
       sizes, // New sizes with explicit SizesType
+      data_ptr, // Reuse the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
       strides, // New strides with explicit StridesType
       dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting
   );
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -8,8 +8,8 @@
 
 #pragma once
 
-#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
@@ -278,30 +278,6 @@ TEST_F(AOTITorchEmptyStridedTest, LargeTensor) {
   EXPECT_EQ(tensor->size(2), 50);
 }
 
-// Test error handling with memory allocation failures
-TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) {
-  // Try to create a very large tensor that might cause allocation failure
-  // (This test may pass or fail depending on available memory)
-  std::vector<int64_t> huge_sizes = {10000, 10000, 100}; // ~38GB for float32
-  Tensor* tensor;
-
-  AOTITorchError error = aoti_torch_empty_strided(
-      huge_sizes.size(),
-      huge_sizes.data(),
-      nullptr,
-      6, // float32
-      1, // CUDA device
-      0, // device index
-      &tensor);
-
-  // Either succeed or fail with memory allocation error
-  if (error == Error::Ok) {
-    EXPECT_NE(tensor, nullptr);
-  } else {
-    EXPECT_EQ(error, Error::MemoryAllocationFailed);
-  }
-}
-
 // Test aoti_torch_empty_strided with bfloat16 dtype
 TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) {
   // Test creating bfloat16 tensor on CUDA
diff --git a/backends/cuda/runtime/tensor/tensor_maker.cpp b/backends/cuda/runtime/tensor/tensor_maker.cpp
@@ -0,0 +1,155 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
+
+#include <algorithm>
+#include <numeric>
+
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+
+namespace executorch::backends::cuda {
+
+namespace {
+#ifndef USE_ATEN_LIB
+/**
+ * Bundles a TensorImpl/Tensor pair with the metadata arrays (sizes,
+ * dim_order, strides) that the TensorImpl is constructed from, plus an
+ * optional deleter for the data buffer. Tensor does not own these arrays or
+ * the data itself, so keeping them together gives them the same lifetime as
+ * the Tensor. The destructor invokes the deleter, if one was supplied, on the
+ * tensor's data pointer.
+ *
+ * `tensor` holds a pointer to the sibling `tensor_impl` member, so a Storage
+ * must not be copied; the copy operations are deleted for that reason.
+ */
+struct Storage final {
+  executorch::aten::TensorImpl tensor_impl;
+  executorch::aten::Tensor tensor;
+  std::vector<executorch::aten::SizesType> sizes;
+  std::vector<executorch::aten::DimOrderType> dim_order;
+  std::vector<executorch::aten::StridesType> strides;
+  std::function<void(void*)> deleter;
+
+  Storage(
+      executorch::aten::TensorImpl&& tensor_impl,
+      std::vector<executorch::aten::SizesType>&& sizes,
+      std::vector<executorch::aten::DimOrderType>&& dim_order,
+      std::vector<executorch::aten::StridesType>&& strides,
+      std::function<void(void*)>&& deleter)
+      : tensor_impl(std::move(tensor_impl)),
+        tensor(&this->tensor_impl),
+        sizes(std::move(sizes)),
+        dim_order(std::move(dim_order)),
+        strides(std::move(strides)),
+        deleter(std::move(deleter)) {}
+
+  // Copying would run the deleter twice and leave the copy's `tensor`
+  // pointing at the source object's `tensor_impl`.
+  Storage(const Storage&) = delete;
+  Storage& operator=(const Storage&) = delete;
+
+  ~Storage() {
+    if (deleter) {
+      deleter(tensor_impl.mutable_data());
+    }
+  }
+};
+#endif // USE_ATEN_LIB
+} // namespace
+
+/**
+ * Builds a TensorPtr over `data` without copying it. Caller-provided strides
+ * are used as-is; a missing dim_order or missing strides are filled in below.
+ * Fails an ET_CHECK_MSG if dim_order/strides are non-empty but their length
+ * differs from sizes, or if default strides cannot be computed.
+ */
+TensorPtr make_tensor(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type,
+    executorch::aten::TensorShapeDynamism dynamism,
+    std::function<void(void*)> deleter) {
+  const auto dim = sizes.size();
+  ET_CHECK_MSG(
+      dim_order.empty() || dim_order.size() == dim,
+      "dim_order size must match sizes or be empty.");
+  ET_CHECK_MSG(
+      strides.empty() || strides.size() == dim,
+      "strides size must match sizes or be empty.");
+
+  if (dim_order.empty()) {
+    // Start from the identity order; when strides are given, order the
+    // dimensions by descending stride. stable_sort keeps dimensions with
+    // equal strides in index order, so the result is deterministic.
+    dim_order.resize(dim);
+    std::iota(dim_order.begin(), dim_order.end(), 0);
+    if (!strides.empty()) {
+      std::stable_sort(
+          dim_order.begin(),
+          dim_order.end(),
+          [&strides](
+              executorch::aten::DimOrderType lhs,
+              executorch::aten::DimOrderType rhs) {
+            return strides[lhs] > strides[rhs];
+          });
+    }
+  }
+
+  // AOTI backends (like AOTI-CUDA) handle both contiguous and non-contiguous
+  // tensors, so caller-provided strides are passed through as-is without
+  // validation. Only when no strides were given are they derived from
+  // dim_order, so a non-empty shape never ends up with an empty strides array.
+  if (strides.empty()) {
+    std::vector<executorch::aten::StridesType> computed_strides(dim);
+    const auto error = executorch::runtime::dim_order_to_stride(
+        sizes.data(), dim_order.data(), dim, computed_strides.data());
+    ET_CHECK_MSG(
+        error == executorch::runtime::Error::Ok, "Failed to compute strides.");
+    strides = std::move(computed_strides);
+  }
+
+#ifndef USE_ATEN_LIB
+  executorch::aten::TensorImpl tensor_impl(
+      type,
+      dim,
+      sizes.data(),
+      data,
+      dim_order.data(),
+      strides.data(),
+      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC);
+  auto storage = std::make_shared<Storage>(
+      std::move(tensor_impl),
+      std::move(sizes),
+      std::move(dim_order),
+      std::move(strides),
+      std::move(deleter));
+  // Aliasing constructor: owns the Storage but points at its Tensor member.
+  const auto tensor_ptr = &storage->tensor;
+  return std::shared_ptr<executorch::aten::Tensor>(
+      std::move(storage), tensor_ptr);
+#else
+  // NOTE(review): the storage and dispatch key are CPU even though this helper
+  // lives in the CUDA backend — confirm this is intended for ATen builds.
+  auto options = c10::TensorOptions()
+                     .dtype(c10::scalarTypeToTypeMeta(type))
+                     .device(c10::kCPU);
+  auto storage = c10::Storage(
+      c10::Storage::use_byte_size_t(),
+      at::detail::computeStorageNbytes(
+          sizes, strides, options.dtype().itemsize()),
+      c10::InefficientStdFunctionContext::makeDataPtr(
+          data, std::move(deleter), options.device()),
+      nullptr,
+      false);
+  auto tensor_impl = c10::make_intrusive<executorch::aten::TensorImpl>(
+      std::move(storage),
+      c10::DispatchKeySet(c10::DispatchKey::CPU),
+      options.dtype());
+  tensor_impl->set_sizes_and_strides(sizes, strides);
+  return std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
+#endif // USE_ATEN_LIB
+}
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/tensor/tensor_maker.h b/backends/cuda/runtime/tensor/tensor_maker.h
@@ -0,0 +1,50 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace executorch::backends::cuda {
+
+/**
+ * A smart pointer type for managing the lifecycle of a Tensor.
+ * This is compatible with executorch::extension::TensorPtr.
+ */
+using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
+
+/**
+ * Creates a TensorPtr for AOTI backends that skips stride calculation and
+ * non-contiguous tensor checks. This is specifically designed for AOTI-CUDA
+ * which handles both contiguous and non-contiguous tensors.
+ *
+ * This function is similar to executorch::extension::make_tensor_ptr but
+ * bypasses the stride validation that assumes contiguous tensors, making it
+ * suitable for AOTI backends that support arbitrary strides.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer; it is wrapped, not copied.
+ * @param dim_order Dimension order; if empty, it is generated automatically.
+ * @param strides Strides per dimension; used as-is, without validation.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed Tensor
+ * object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+TensorPtr make_tensor(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    std::vector<executorch::aten::DimOrderType> dim_order,
+    std::vector<executorch::aten::StridesType> strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function<void(void*)> deleter = nullptr);
+
+} // namespace executorch::backends::cuda
diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp
@@ -83,17 +83,7 @@ TensorPtr make_tensor_ptr(
 // Skip stride calculation and incontiguous tensor check for CUDA backend since
 // AOTI-CUDA handles both contiguous and incontiguous tensors. This will be
 // removed after SlimTensor migration.
-#ifdef USE_CUDA_BACKEND
-  if (strides.empty()) {
-    std::vector<executorch::aten::StridesType> computed_strides(dim);
-
-    auto error = runtime::dim_order_to_stride(
-        sizes.data(), dim_order.data(), dim, computed_strides.data());
-    ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
-
-    strides = std::move(computed_strides);
-  }
-#else
+#ifndef USE_CUDA_BACKEND
   std::vector<executorch::aten::StridesType> computed_strides(dim);
 
   auto error = runtime::dim_order_to_stride(