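trying something else: declare CudaBackend in a new header (CudaBackend.h) and register it manually from the voxtral example's main(), removing InitCudaBackend() and the aoti_cuda.def reference from the build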

JacobSzwejbka · JacobSzwejbka · commit 53dc564cecd5 · 2025-10-18T17:37:38.000-07:00
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -46,15 +46,15 @@ set(_aoti_cuda_sources
 )
 # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
 if(MSVC)
-  add_library(aoti_cuda SHARED ${_aoti_cuda_sources} ${CMAKE_CURRENT_SOURCE_DIR}/aoti_cuda.def)
+  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
   # Define export macros for Windows DLL
   target_compile_definitions(aoti_cuda PRIVATE
     EXPORT_AOTI_FUNCTIONS
     BUILDING_CUDA_BACKEND
   )
   # Ensure proper DLL import/export library naming on Windows with config-specific paths
   set_target_properties(aoti_cuda PROPERTIES
-    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT and .def file
+    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
diff --git a/backends/cuda/aoti_cuda.def b/backends/cuda/aoti_cuda.def
diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h
@@ -0,0 +1,53 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+namespace executorch::backends::cuda {
+
+class ET_EXPERIMENTAL CudaBackend final
+    : public ::executorch::runtime::BackendInterface {
+ private:
+  /** Load AOTI function pointers from the shared library into the handle.
+   * NOTE(review): `struct AOTIDelegateHandle` declares a new type in this
+   * namespace unless one is already visible here; confirm that is intended. */
+  ::executorch::runtime::Error load_function_pointers_into_handle(
+      void* so_handle,
+      struct AOTIDelegateHandle* handle) const;
+
+ public:
+  /**
+   * Check if the CUDA backend is available. The current definition in
+   * cuda_backend.cpp returns true unconditionally. */
+  bool is_available() const override;
+
+  /**
+   * Initialize the backend with the given context and compile specs; the
+   * "method_name" compile spec is looked up by key. Called once per loaded
+   * binary blob. Returns the delegate handle on success. */
+  ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
+      ::executorch::runtime::BackendInitContext& context,
+      ::executorch::runtime::FreeableBuffer* processed,
+      ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec>
+          compile_specs) const override;
+
+  /**
+   * Execute the backend with the given context and arguments. The definition
+   * casts `handle` to an AOTIDelegateHandle*. Called once per
+   * execution. */
+  ::executorch::runtime::Error execute(
+      ::executorch::runtime::BackendExecutionContext& context,
+      ::executorch::runtime::DelegateHandle* handle,
+      ::executorch::runtime::Span<::executorch::runtime::EValue*> args)
+      const override;
+
+  /**
+   * Destroy the backend handle and clean up resources. A null handle is
+   * ignored. */
+  void destroy(::executorch::runtime::DelegateHandle* handle) const override;
+};
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -19,10 +19,10 @@
 #include <string>
 #include <vector>
 
-// Include our shim layer headers
+// Include class header and shim layer headers
+#include <executorch/backends/cuda/runtime/CudaBackend.h>
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/aoti/common_shims.h>
-#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -48,12 +48,9 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-class ET_EXPERIMENTAL CudaBackend final
-    : public ::executorch::runtime::BackendInterface {
- private:
-  Error load_function_pointers_into_handle(
-      void* so_handle,
-      AOTIDelegateHandle* handle) const {
+Error CudaBackend::load_function_pointers_into_handle(
+    void* so_handle,
+    AOTIDelegateHandle* handle) const {
 #define LOAD_SYMBOL(member, name)                                    \
   do {                                                               \
     auto symbol_res = get_function(so_handle, #name);                \
@@ -88,17 +85,14 @@ class ET_EXPERIMENTAL CudaBackend final
     return Error::Ok;
   }
 
- public:
-  bool is_available() const override {
-    return 1;
-  }
+bool CudaBackend::is_available() const {
+  return 1;
+}
 
-  // Once per loaded binary blob
-  Result<DelegateHandle*> init(
+Result<DelegateHandle*> CudaBackend::init(
       BackendInitContext& context,
-      FreeableBuffer* processed, // This will be a empty buffer
-      ArrayRef<CompileSpec> compile_specs // This will be my empty list
-  ) const override {
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const {
     std::string method_name;
     for (const CompileSpec& spec : compile_specs) {
       if (std::strcmp(spec.key, "method_name") == 0) {
@@ -196,11 +190,10 @@ class ET_EXPERIMENTAL CudaBackend final
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
-  // Once per execution
-  Error execute(
-      BackendExecutionContext& context,
-      DelegateHandle* handle_,
-      Span<EValue*> args) const override {
+Error CudaBackend::execute(
+    BackendExecutionContext& context,
+    DelegateHandle* handle_,
+    Span<EValue*> args) const {
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
     size_t n_inputs;
@@ -322,7 +315,7 @@ class ET_EXPERIMENTAL CudaBackend final
     return Error::Ok;
   }
 
-  void destroy(DelegateHandle* handle_) const override {
+void CudaBackend::destroy(DelegateHandle* handle_) const {
     if (handle_ == nullptr) {
       return;
     }
@@ -367,54 +360,21 @@ class ET_EXPERIMENTAL CudaBackend final
     delete handle;
     clear_all_tensors();
   }
-};
 
 } // namespace executorch::backends::cuda
 
 namespace executorch::backends {
-namespace {
-// Static backend instance and registration
+
+// Backend instance - namespace-scope object with static storage duration on all platforms
 auto cls = cuda::CudaBackend();
-executorch::runtime::Backend backend{"CudaBackend", &cls};
+executorch::runtime::Backend cuda_backend{"CudaBackend", &cls};
 
 #ifndef _WIN32
-// On non-Windows platforms, use static initialization
+// On non-Windows platforms, use automatic static initialization
+namespace {
 static executorch::runtime::Error success_with_compiler =
-    register_backend(backend);
-#endif
-
+    register_backend(cuda_backend);
 } // namespace
-
-// InitCudaBackend is exported for explicit backend registration on Windows
-extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() {
-  // Log immediately to confirm function is entered
-  ET_LOG(Info, "InitCudaBackend: Function entered");
-  assert(1==2);
-  
-#ifdef _WIN32
-  ET_LOG(Info, "InitCudaBackend: Windows path");
-  // On Windows, explicitly register the backend since DLL static initializers
-  // don't run reliably
-  static bool initialized = false;
-  if (!initialized) {
-    ET_LOG(Info, "Registering CUDA backend on Windows");
-    auto error = register_backend(backend);
-    if (error == executorch::runtime::Error::Ok) {
-      ET_LOG(Info, "Successfully registered CudaBackend");
-    } else {
-      ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
-    }
-    initialized = true;
-  } else {
-    ET_LOG(Info, "CUDA backend already initialized");
-  }
-#else
-  ET_LOG(Info, "InitCudaBackend: Non-Windows path");
-  // On other platforms, static initialization already happened
-  (void)success_with_compiler;
 #endif
-  
-  ET_LOG(Info, "InitCudaBackend: Function exiting");
-}
 
 } // namespace executorch::backends
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
@@ -25,6 +25,10 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/log.h>
 
+// Manually register the CUDA backend
+#include <executorch/backends/cuda/runtime/CudaBackend.h>
+#include <executorch/runtime/backend/interface.h>
+
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -282,14 +286,18 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
-#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 
 int32_t main(int32_t argc, char** argv) {
-  // On Windows, explicitly initialize the CUDA backend to ensure
-  // static initializers in the DLL run
-  ET_LOG(Info, "About to call InitCudaBackend");
-  InitCudaBackend();
-  ET_LOG(Info, "InitCudaBackend returned");
+  // Manually register the CUDA backend (required on Windows, harmless on other platforms)
+  ET_LOG(Info, "Registering CUDA backend");
+  static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend();
+  static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl};
+  auto error = ::executorch::runtime::register_backend(cuda_backend);
+  if (error == ::executorch::runtime::Error::Ok) {
+    ET_LOG(Info, "Successfully registered CudaBackend");
+  } else {
+    ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
+  }
   
   gflags::ParseCommandLineFlags(&argc, &argv, true);