From 118cb0e66714f9059a3a82e9492cddfbbc7a6c8a Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 10:53:48 -0700
Subject: [PATCH 01/34] dont build custom or quantized ops on msvc cuda

---
 examples/models/voxtral/CMakeLists.txt | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 866d17160ba..76569740354 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch)
 set(link_libraries executorch gflags)
 set(_srcs multimodal.cpp)
 
-list(
-  APPEND
-  link_libraries
-  optimized_native_cpu_ops_lib
-  quantized_ops_lib
-  custom_ops
-  cpublas
-  eigen_blas
-)
+# Common ops for all builds
+list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
 executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
-executorch_target_link_options_shared_lib(quantized_ops_lib)
-executorch_target_link_options_shared_lib(custom_ops)
+
+# CPU-only builds need quantized and custom ops
+if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
+  list(APPEND link_libraries quantized_ops_lib custom_ops)
+  executorch_target_link_options_shared_lib(quantized_ops_lib)
+  executorch_target_link_options_shared_lib(custom_ops)
+endif()
 
 # XNNPACK
 if(TARGET xnnpack_backend)

From cd93bdae20d0817431deafb0cc8a917ce320cb7a Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 12:54:23 -0700
Subject: [PATCH 02/34] dll attempt 1

---
 backends/cuda/CMakeLists.txt           |  9 ++++++++-
 examples/models/voxtral/CMakeLists.txt | 10 ++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 8b94351d469..c264540266f 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -44,7 +44,14 @@ set(_aoti_cuda_sources
     runtime/shims/int4mm.cu
     runtime/platform/platform.cpp
 )
-add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
+# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
+if(MSVC)
+  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
+  # Define export macro for Windows DLL
+  target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS)
+else()
+  add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
+endif()
 target_include_directories(
   aoti_cuda
   PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 76569740354..4e4367975fe 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -88,7 +88,17 @@ list(
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
   list(APPEND link_libraries aoti_cuda)
+  # Handle both static library and shared library (.dll on Windows MSVC)
   executorch_target_link_options_shared_lib(aoti_cuda)
+  # On MSVC, ensure the DLL is copied to the output directory
+  if(MSVC)
+    add_custom_command(TARGET voxtral_runner POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        $<TARGET_FILE:aoti_cuda>
+        $<TARGET_FILE_DIR:voxtral_runner>
+      COMMENT "Copying aoti_cuda.dll to output directory"
+    )
+  endif()
 endif()
 
 if(EXECUTORCH_BUILD_METAL)

From 0af3f62c59b4dae5d2a18e2e6d4be43fea04c224 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 13:19:15 -0700
Subject: [PATCH 03/34] add export macro to header definitions

---
 backends/cuda/runtime/export.h                | 24 +++++++++++++++++++
 backends/cuda/runtime/shims/cuda_guard.h      | 13 +++++-----
 backends/cuda/runtime/shims/int4mm.h          |  3 ++-
 backends/cuda/runtime/shims/memory.h          | 13 +++++-----
 .../cuda/runtime/shims/tensor_attribute.h     |  5 ++--
 5 files changed, 43 insertions(+), 15 deletions(-)
 create mode 100644 backends/cuda/runtime/export.h

diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h
new file mode 100644
index 00000000000..23623888e85
--- /dev/null
+++ b/backends/cuda/runtime/export.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// Define export macro for Windows DLL
+// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by CMake,
+// which causes this macro to export symbols using __declspec(dllexport).
+// When consuming the library, the macro imports symbols using __declspec(dllimport).
+// On non-Windows platforms, the macro is empty and has no effect.
+#ifdef _WIN32
+#ifdef EXPORT_AOTI_FUNCTIONS
+#define AOTI_CUDA_EXPORT __declspec(dllexport)
+#else
+#define AOTI_CUDA_EXPORT __declspec(dllimport)
+#endif
+#else
+#define AOTI_CUDA_EXPORT
+#endif
diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h
index f930f3df643..932ce6ab785 100644
--- a/backends/cuda/runtime/shims/cuda_guard.h
+++ b/backends/cuda/runtime/shims/cuda_guard.h
@@ -11,6 +11,7 @@
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/backends/cuda/runtime/export.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
@@ -33,7 +34,7 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*;
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_create_cuda_guard(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard(
     int32_t device_index,
     CUDAGuardHandle* ret_guard);
 
@@ -44,7 +45,7 @@ AOTITorchError aoti_torch_create_cuda_guard(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
 
 /**
  * Sets the CUDA device to a new index for an existing guard.
@@ -54,7 +55,7 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_cuda_guard_set_index(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index(
     CUDAGuardHandle guard,
     int32_t device_index);
 
@@ -69,7 +70,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_create_cuda_stream_guard(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(
     void* stream,
     int32_t device_index,
     CUDAStreamGuardHandle* ret_guard);
@@ -81,7 +82,7 @@ AOTITorchError aoti_torch_create_cuda_stream_guard(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
 
 /**
  * Gets the current CUDA stream for a specified device.
@@ -91,7 +92,7 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_get_current_cuda_stream(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream(
     int32_t device_index,
     void** ret_stream);
 
diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h
index 6bd2d9b3a79..b84a7a4d149 100644
--- a/backends/cuda/runtime/shims/int4mm.h
+++ b/backends/cuda/runtime/shims/int4mm.h
@@ -10,6 +10,7 @@
 
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/export.h>
 
 namespace executorch::backends::cuda {
 
@@ -69,7 +70,7 @@ extern "C" {
  *     or invalid qGroupSize
  *   - Error::Internal: CUDA kernel launch failure
  */
-AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
     Tensor* self,
     Tensor* mat2,
     int64_t qGroupSize,
diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
index 7a8d4c3609b..924f268d3ca 100644
--- a/backends/cuda/runtime/shims/memory.h
+++ b/backends/cuda/runtime/shims/memory.h
@@ -10,6 +10,7 @@
 
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/export.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
@@ -43,7 +44,7 @@ extern "C" {
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     void* data,
     int64_t ndim,
     const int64_t* sizes_ptr,
@@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_empty_strided(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_empty_strided(
     int64_t ndim,
     const int64_t* sizes_ptr,
     const int64_t* strides_ptr,
@@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor);
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor);
 
 /**
  * Creates a tensor view that reinterprets the same underlying memory with
@@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor);
  *
  * @return Error::Ok on success, appropriate error code on failure
  */
-AOTITorchError aoti_torch__reinterpret_tensor(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
     Tensor* self,
     int64_t ndim,
     const int64_t* sizes_ptr,
@@ -136,11 +137,11 @@ AOTITorchError aoti_torch__reinterpret_tensor(
  *         - Error::MemoryAllocationFailed: failed to allocate temporary memory
  *         - Error::Internal: CUDA operation failures
  */
-AOTITorchError
+AOTI_CUDA_EXPORT AOTITorchError
 aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
 
 // Function to clear all tensors from internal storage
-void clear_all_tensors();
+AOTI_CUDA_EXPORT void clear_all_tensors();
 } // extern "C"
 
 } // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
index 15a4e397d24..8b4950a154d 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.h
+++ b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -10,6 +10,7 @@
 
 #include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/backends/cuda/runtime/export.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
@@ -24,12 +25,12 @@ extern "C" {
 using AOTITorchError = Error;
 
 // Device type functions for tensor attributes
-AOTITorchError aoti_torch_get_device_type(
+AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_device_type(
     Tensor* tensor,
     int32_t* ret_device_type);
 
 // Device type constants
-int32_t aoti_torch_device_type_cuda();
+AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda();
 
 } // extern "C"
 

From 33682489c9669e4541b9cb3f759b1bc651926d66 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 13:23:55 -0700
Subject: [PATCH 04/34] voxtral cmake

---
 examples/models/voxtral/CMakeLists.txt | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 4e4367975fe..e2684b05a5c 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -90,15 +90,6 @@ if(EXECUTORCH_BUILD_CUDA)
   list(APPEND link_libraries aoti_cuda)
   # Handle both static library and shared library (.dll on Windows MSVC)
   executorch_target_link_options_shared_lib(aoti_cuda)
-  # On MSVC, ensure the DLL is copied to the output directory
-  if(MSVC)
-    add_custom_command(TARGET voxtral_runner POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different
-        $<TARGET_FILE:aoti_cuda>
-        $<TARGET_FILE_DIR:voxtral_runner>
-      COMMENT "Copying aoti_cuda.dll to output directory"
-    )
-  endif()
 endif()
 
 if(EXECUTORCH_BUILD_METAL)
@@ -110,6 +101,16 @@ endif()
 list(APPEND link_libraries tokenizers::tokenizers)
 
 add_executable(voxtral_runner ${_srcs})
+
+# On MSVC, ensure the aoti_cuda DLL is copied to the output directory
+if(EXECUTORCH_BUILD_CUDA AND MSVC)
+  add_custom_command(TARGET voxtral_runner POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+      $<TARGET_FILE:aoti_cuda>
+      $<TARGET_FILE_DIR:voxtral_runner>
+    COMMENT "Copying aoti_cuda.dll to output directory"
+  )
+endif()
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(voxtral_runner)
   if(NOT APPLE)

From 450b6fb15653501efaadf49de80fbdbd06d35199 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 13:43:00 -0700
Subject: [PATCH 05/34] cmake dll install changes

---
 backends/cuda/CMakeLists.txt           | 11 ++++++++++-
 examples/models/voxtral/CMakeLists.txt | 10 ----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index c264540266f..955991f9f18 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -49,6 +49,13 @@ if(MSVC)
   add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
   # Define export macro for Windows DLL
   target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS)
+  # Ensure proper DLL import/export library naming on Windows
+  set_target_properties(aoti_cuda PROPERTIES
+    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+  )
 else()
   add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 endif()
@@ -89,5 +96,7 @@ endif()
 install(
   TARGETS aoti_cuda
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  LIBRARY DESTINATION lib
+  ARCHIVE DESTINATION lib
+  RUNTIME DESTINATION bin
 )
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index e2684b05a5c..c1e52c9095c 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -101,16 +101,6 @@ endif()
 list(APPEND link_libraries tokenizers::tokenizers)
 
 add_executable(voxtral_runner ${_srcs})
-
-# On MSVC, ensure the aoti_cuda DLL is copied to the output directory
-if(EXECUTORCH_BUILD_CUDA AND MSVC)
-  add_custom_command(TARGET voxtral_runner POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different
-      $<TARGET_FILE:aoti_cuda>
-      $<TARGET_FILE_DIR:voxtral_runner>
-    COMMENT "Copying aoti_cuda.dll to output directory"
-  )
-endif()
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(voxtral_runner)
   if(NOT APPLE)

From 0bbcd7194eef65fcd5dd0b48404ec253fd7722cc Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 14:13:36 -0700
Subject: [PATCH 06/34] more cmake stuff to try

---
 examples/models/voxtral/CMakeLists.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index c1e52c9095c..978d78a8ba3 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -88,8 +88,11 @@ list(
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
   list(APPEND link_libraries aoti_cuda)
-  # Handle both static library and shared library (.dll on Windows MSVC)
-  executorch_target_link_options_shared_lib(aoti_cuda)
+  # CMake will automatically use the import library on Windows
+  if(NOT MSVC)
+    # Only apply shared lib linking options on non-Windows platforms
+    executorch_target_link_options_shared_lib(aoti_cuda)
+  endif()
 endif()
 
 if(EXECUTORCH_BUILD_METAL)
@@ -103,7 +106,7 @@ list(APPEND link_libraries tokenizers::tokenizers)
 add_executable(voxtral_runner ${_srcs})
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(voxtral_runner)
-  if(NOT APPLE)
+  if(NOT APPLE AND NOT MSVC)
     target_link_options(voxtral_runner PRIVATE "LINKER:-s")
   endif()
 endif()

From efcdfa1c0914623c1eb99e726a563dace02fb7d1 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 15:01:49 -0700
Subject: [PATCH 07/34] hacky work to get backend registration

---
 backends/cuda/runtime/cuda_backend.cpp | 13 +++++++++++++
 examples/models/voxtral/CMakeLists.txt | 22 +++++++++++-----------
 examples/models/voxtral/multimodal.cpp |  9 +++++++++
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 0cef859ddfb..a72d866e261 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -376,4 +376,17 @@ executorch::runtime::Backend backend{"CudaBackend", &cls};
 static executorch::runtime::Error success_with_compiler =
     register_backend(backend);
 } // namespace
+
+// Export an initialization function to ensure static initializers run on Windows
+#ifdef _WIN32
+#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport)
+#else
+#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default")))
+#endif
+
+extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
+  // Force the static initializer to run by referencing it
+  (void)success_with_compiler;
+}
+
 } // namespace executorch::backends
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 978d78a8ba3..3639d938db9 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -84,17 +84,6 @@ list(
   extension_flat_tensor
 )
 
-# Link CUDA backend
-if(EXECUTORCH_BUILD_CUDA)
-  find_package(CUDAToolkit REQUIRED)
-  list(APPEND link_libraries aoti_cuda)
-  # CMake will automatically use the import library on Windows
-  if(NOT MSVC)
-    # Only apply shared lib linking options on non-Windows platforms
-    executorch_target_link_options_shared_lib(aoti_cuda)
-  endif()
-endif()
-
 if(EXECUTORCH_BUILD_METAL)
   list(APPEND link_libraries metal_backend)
   executorch_target_link_options_shared_lib(metal_backend)
@@ -111,6 +100,17 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   endif()
 endif()
 
+# Link CUDA backend
+if(EXECUTORCH_BUILD_CUDA AND NOT MSVC)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+elseif(EXECUTORCH_BUILD_CUDA AND MSVC)
+  find_package(CUDAToolkit REQUIRED)
+  target_link_libraries(voxtral_runner PRIVATE aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
+
 target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
 target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
 target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 29edf955751..0bfdb60dfca 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -282,7 +282,16 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
+// Forward declare the initialization function from aoti_cuda
+extern "C" void InitCudaBackend();
+
 int32_t main(int32_t argc, char** argv) {
+#ifdef _WIN32
+  // On Windows, explicitly initialize the CUDA backend to ensure
+  // static initializers in the DLL run
+  InitCudaBackend();
+#endif
+  
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   const char* model_path = FLAGS_model_path.c_str();

From 0b000cd216bd5a51e4d8efc6c064e8b097094987 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:02:45 -0700
Subject: [PATCH 08/34] more cmake checks

---
 backends/cuda/CMakeLists.txt           |  6 +++++-
 examples/models/voxtral/CMakeLists.txt | 16 +++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 955991f9f18..f55d94ec165 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -82,7 +82,11 @@ target_link_libraries(
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
-executorch_target_link_options_shared_lib(aoti_cuda)
+
+# Only apply shared lib options on non-Windows platforms
+if(NOT MSVC)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
 
 if(BUILD_TESTING)
   # Add runtime
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 3639d938db9..cfb2acd3759 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -101,14 +101,16 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 
 # Link CUDA backend
-if(EXECUTORCH_BUILD_CUDA AND NOT MSVC)
+if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
-  list(APPEND link_libraries aoti_cuda)
-  executorch_target_link_options_shared_lib(aoti_cuda)
-elseif(EXECUTORCH_BUILD_CUDA AND MSVC)
-  find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(voxtral_runner PRIVATE aoti_cuda)
-  executorch_target_link_options_shared_lib(aoti_cuda)
+  if(MSVC)
+    # On MSVC, link directly without shared lib options to use import library
+    list(APPEND link_libraries aoti_cuda)
+  else()
+    # On non-MSVC, use shared lib options
+    list(APPEND link_libraries aoti_cuda)
+    executorch_target_link_options_shared_lib(aoti_cuda)
+  endif()
 endif()
 
 target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})

From 6fca29a27341882d27d8ccb686ddb038954b15b0 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:08:17 -0700
Subject: [PATCH 09/34] static init hacks

---
 backends/cuda/runtime/cuda_backend.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index a72d866e261..1e6325fe277 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -371,13 +371,19 @@ class ET_EXPERIMENTAL CudaBackend final
 
 namespace executorch::backends {
 namespace {
+// Static backend instance and registration
 auto cls = cuda::CudaBackend();
 executorch::runtime::Backend backend{"CudaBackend", &cls};
+
+#ifndef _WIN32
+// On non-Windows platforms, use static initialization
 static executorch::runtime::Error success_with_compiler =
     register_backend(backend);
+#endif
+
 } // namespace
 
-// Export an initialization function to ensure static initializers run on Windows
+// Export an initialization function for explicit backend registration
 #ifdef _WIN32
 #define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport)
 #else
@@ -385,8 +391,18 @@ static executorch::runtime::Error success_with_compiler =
 #endif
 
 extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
-  // Force the static initializer to run by referencing it
+#ifdef _WIN32
+  // On Windows, explicitly register the backend since DLL static initializers
+  // don't run reliably
+  static bool initialized = false;
+  if (!initialized) {
+    register_backend(backend);
+    initialized = true;
+  }
+#else
+  // On other platforms, static initialization already happened
   (void)success_with_compiler;
+#endif
 }
 
 } // namespace executorch::backends

From afadb48000b7cb893cee70005dcbdcf2b44e814f Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:11:02 -0700
Subject: [PATCH 10/34] add log

---
 backends/cuda/runtime/cuda_backend.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 1e6325fe277..945dfda4209 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -391,6 +391,9 @@ static executorch::runtime::Error success_with_compiler =
 #endif
 
 extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
+  ET_LOG(
+      Info,
+      "CALLING INITCUDABACKEND");
 #ifdef _WIN32
   // On Windows, explicitly register the backend since DLL static initializers
   // don't run reliably

From 357bf9f1ba44faafb46e215a0e27e6dd2795f20e Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:15:40 -0700
Subject: [PATCH 11/34] more hacks

---
 examples/models/voxtral/multimodal.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 0bfdb60dfca..90663ee2158 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -283,13 +283,19 @@ MultimodalInput processAudioFile(
 } // namespace
 
 // Forward declare the initialization function from aoti_cuda
+#ifdef _WIN32
+extern "C" __declspec(dllimport) void InitCudaBackend();
+#else
 extern "C" void InitCudaBackend();
+#endif
 
 int32_t main(int32_t argc, char** argv) {
 #ifdef _WIN32
   // On Windows, explicitly initialize the CUDA backend to ensure
   // static initializers in the DLL run
+  ET_LOG(Info, "About to call InitCudaBackend");
   InitCudaBackend();
+  ET_LOG(Info, "InitCudaBackend returned");
 #endif
   
   gflags::ParseCommandLineFlags(&argc, &argv, true);

From d2d8acf021899ecd80cda51a431254a4cbfd0858 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:18:36 -0700
Subject: [PATCH 12/34] hacks

---
 backends/cuda/runtime/cuda_backend.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 945dfda4209..5454213dfca 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -391,19 +391,25 @@ static executorch::runtime::Error success_with_compiler =
 #endif
 
 extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
-  ET_LOG(
-      Info,
-      "CALLING INITCUDABACKEND");
 #ifdef _WIN32
   // On Windows, explicitly register the backend since DLL static initializers
   // don't run reliably
   static bool initialized = false;
   if (!initialized) {
-    register_backend(backend);
+    ET_LOG(Info, "Registering CUDA backend on Windows");
+    auto error = register_backend(backend);
+    if (error == executorch::runtime::Error::Ok) {
+      ET_LOG(Info, "Successfully registered CudaBackend");
+    } else {
+      ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
+    }
     initialized = true;
+  } else {
+    ET_LOG(Info, "CUDA backend already initialized");
   }
 #else
   // On other platforms, static initialization already happened
+  ET_LOG(Info, "CUDA backend using static initialization");
   (void)success_with_compiler;
 #endif
 }

From 3f56f132c982ef17002189916b25f76144e84ece Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:25:54 -0700
Subject: [PATCH 13/34] config specific lib and bin locations

---
 backends/cuda/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index f55d94ec165..7d5cfbedca1 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -49,12 +49,12 @@ if(MSVC)
   add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
   # Define export macro for Windows DLL
   target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS)
-  # Ensure proper DLL import/export library naming on Windows
+  # Ensure proper DLL import/export library naming on Windows with config-specific paths
   set_target_properties(aoti_cuda PROPERTIES
     WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
-    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
-    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
-    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
   )
 else()
   add_library(aoti_cuda STATIC ${_aoti_cuda_sources})

From b3b654d40b17c63e9703a3b2d7c4bf1d1d6dd8aa Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:34:35 -0700
Subject: [PATCH 14/34] hacks

---
 backends/cuda/runtime/cuda_backend.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 5454213dfca..f9be82c4140 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -391,7 +391,11 @@ static executorch::runtime::Error success_with_compiler =
 #endif
 
 extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
+  // Log immediately to confirm function is entered
+  ET_LOG(Info, "InitCudaBackend: Function entered");
+  
 #ifdef _WIN32
+  ET_LOG(Info, "InitCudaBackend: Windows path");
   // On Windows, explicitly register the backend since DLL static initializers
   // don't run reliably
   static bool initialized = false;
@@ -408,10 +412,12 @@ extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
     ET_LOG(Info, "CUDA backend already initialized");
   }
 #else
+  ET_LOG(Info, "InitCudaBackend: Non-Windows path");
   // On other platforms, static initialization already happened
-  ET_LOG(Info, "CUDA backend using static initialization");
   (void)success_with_compiler;
 #endif
+  
+  ET_LOG(Info, "InitCudaBackend: Function exiting");
 }
 
 } // namespace executorch::backends

From cac7849d115b05502a1b237a6d7148b6d846417b Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 16:51:33 -0700
Subject: [PATCH 15/34] hacks

---
 backends/cuda/CMakeLists.txt              |  7 +++--
 backends/cuda/runtime/cuda_backend.cpp    | 11 ++-----
 backends/cuda/runtime/cuda_backend_init.h | 35 +++++++++++++++++++++++
 examples/models/voxtral/multimodal.cpp    |  9 ++----
 4 files changed, 46 insertions(+), 16 deletions(-)
 create mode 100644 backends/cuda/runtime/cuda_backend_init.h

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 7d5cfbedca1..9befbe5cc56 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -47,8 +47,11 @@ set(_aoti_cuda_sources
 # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
 if(MSVC)
   add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
-  # Define export macro for Windows DLL
-  target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS)
+  # Define export macros for Windows DLL
+  target_compile_definitions(aoti_cuda PRIVATE
+    EXPORT_AOTI_FUNCTIONS
+    BUILDING_CUDA_BACKEND
+  )
   # Ensure proper DLL import/export library naming on Windows with config-specific paths
   set_target_properties(aoti_cuda PROPERTIES
     WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index f9be82c4140..ed55086e56f 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -21,6 +21,7 @@
 // Include our shim layer headers
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -383,14 +384,8 @@ static executorch::runtime::Error success_with_compiler =
 
 } // namespace
 
-// Export an initialization function for explicit backend registration
-#ifdef _WIN32
-#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport)
-#else
-#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default")))
-#endif
-
-extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() {
+// InitCudaBackend is exported for explicit backend registration on Windows
+extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() {
   // Log immediately to confirm function is entered
   ET_LOG(Info, "InitCudaBackend: Function entered");
   
diff --git a/backends/cuda/runtime/cuda_backend_init.h b/backends/cuda/runtime/cuda_backend_init.h
new file mode 100644
index 00000000000..a05dc082857
--- /dev/null
+++ b/backends/cuda/runtime/cuda_backend_init.h
@@ -0,0 +1,35 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport)
+#define CUDA_BACKEND_INIT_IMPORT __declspec(dllimport)
+#else
+#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default")))
+#define CUDA_BACKEND_INIT_IMPORT
+#endif
+
+// When building the DLL, define BUILDING_CUDA_BACKEND
+// When using the DLL, this will import the function
+#ifdef BUILDING_CUDA_BACKEND
+#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_EXPORT
+#else
+#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_IMPORT
+#endif
+
+/**
+ * Initialize the CUDA backend and register it with the ExecutorTorch runtime.
+ * On Windows, this must be called explicitly before loading models that use
+ * the CUDA backend. On other platforms, the backend is registered automatically
+ * via static initialization.
+ */
+CUDA_BACKEND_INIT_API void InitCudaBackend();
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 90663ee2158..9057707b41c 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -282,15 +282,12 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
-// Forward declare the initialization function from aoti_cuda
-#ifdef _WIN32
-extern "C" __declspec(dllimport) void InitCudaBackend();
-#else
-extern "C" void InitCudaBackend();
+#ifdef EXECUTORCH_BUILD_CUDA
+#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 #endif
 
 int32_t main(int32_t argc, char** argv) {
-#ifdef _WIN32
+#ifdef EXECUTORCH_BUILD_CUDA
   // On Windows, explicitly initialize the CUDA backend to ensure
   // static initializers in the DLL run
   ET_LOG(Info, "About to call InitCudaBackend");

From 6040e3ab5cc0e9cc01a55df54961301ef58a3697 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:05:19 -0700
Subject: [PATCH 16/34] defs hack

---
 backends/cuda/CMakeLists.txt           | 4 ++--
 backends/cuda/aoti_cuda.def            | 3 +++
 backends/cuda/runtime/cuda_backend.cpp | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 backends/cuda/aoti_cuda.def

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 9befbe5cc56..36625c0a038 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -46,7 +46,7 @@ set(_aoti_cuda_sources
 )
 # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
 if(MSVC)
-  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
+  add_library(aoti_cuda SHARED ${_aoti_cuda_sources} ${CMAKE_CURRENT_SOURCE_DIR}/aoti_cuda.def)
   # Define export macros for Windows DLL
   target_compile_definitions(aoti_cuda PRIVATE
     EXPORT_AOTI_FUNCTIONS
@@ -54,7 +54,7 @@ if(MSVC)
   )
   # Ensure proper DLL import/export library naming on Windows with config-specific paths
   set_target_properties(aoti_cuda PROPERTIES
-    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
+    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT and .def file
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
diff --git a/backends/cuda/aoti_cuda.def b/backends/cuda/aoti_cuda.def
new file mode 100644
index 00000000000..a1f7433e077
--- /dev/null
+++ b/backends/cuda/aoti_cuda.def
@@ -0,0 +1,3 @@
+LIBRARY aoti_cuda
+EXPORTS
+    InitCudaBackend
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index ed55086e56f..1545347bc3d 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -12,6 +12,7 @@
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <cstdio>
+#include <cassert>
 
 #include <filesystem>
 #include <fstream>
@@ -388,6 +389,7 @@ static executorch::runtime::Error success_with_compiler =
 extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() {
   // Log immediately to confirm function is entered
   ET_LOG(Info, "InitCudaBackend: Function entered");
+  assert(1==2)
   
 #ifdef _WIN32
   ET_LOG(Info, "InitCudaBackend: Windows path");

From 25667f14a4ce911dd52c5b1a835f356ddc2df6a7 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:06:13 -0700
Subject: [PATCH 17/34] remove cuda ifdef

---
 examples/models/voxtral/multimodal.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 9057707b41c..fd6250a41d9 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -282,18 +282,14 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
-#ifdef EXECUTORCH_BUILD_CUDA
 #include <executorch/backends/cuda/runtime/cuda_backend_init.h>
-#endif
 
 int32_t main(int32_t argc, char** argv) {
-#ifdef EXECUTORCH_BUILD_CUDA
   // On Windows, explicitly initialize the CUDA backend to ensure
   // static initializers in the DLL run
   ET_LOG(Info, "About to call InitCudaBackend");
   InitCudaBackend();
   ET_LOG(Info, "InitCudaBackend returned");
-#endif
   
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 

From 1ae9deda5c35eb41ebe331630c3ceebf6bcc84b1 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:10:02 -0700
Subject: [PATCH 18/34] missing semicolon

---
 backends/cuda/runtime/cuda_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 1545347bc3d..7f324b4135a 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -389,7 +389,7 @@ static executorch::runtime::Error success_with_compiler =
 extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() {
   // Log immediately to confirm function is entered
   ET_LOG(Info, "InitCudaBackend: Function entered");
-  assert(1==2)
+  assert(1==2);
   
 #ifdef _WIN32
   ET_LOG(Info, "InitCudaBackend: Windows path");

From 53dc564cecd5f57257a0a868326c0c88f454e42a Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:37:38 -0700
Subject: [PATCH 19/34] trying something else

---
 backends/cuda/CMakeLists.txt           |  4 +-
 backends/cuda/aoti_cuda.def            |  3 -
 backends/cuda/runtime/CudaBackend.h    | 53 ++++++++++++++++
 backends/cuda/runtime/cuda_backend.cpp | 84 +++++++-------------------
 examples/models/voxtral/multimodal.cpp | 20 ++++--
 5 files changed, 91 insertions(+), 73 deletions(-)
 delete mode 100644 backends/cuda/aoti_cuda.def
 create mode 100644 backends/cuda/runtime/CudaBackend.h

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 36625c0a038..9befbe5cc56 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -46,7 +46,7 @@ set(_aoti_cuda_sources
 )
 # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
 if(MSVC)
-  add_library(aoti_cuda SHARED ${_aoti_cuda_sources} ${CMAKE_CURRENT_SOURCE_DIR}/aoti_cuda.def)
+  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
   # Define export macros for Windows DLL
   target_compile_definitions(aoti_cuda PRIVATE
     EXPORT_AOTI_FUNCTIONS
@@ -54,7 +54,7 @@ if(MSVC)
   )
   # Ensure proper DLL import/export library naming on Windows with config-specific paths
   set_target_properties(aoti_cuda PROPERTIES
-    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT and .def file
+    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
diff --git a/backends/cuda/aoti_cuda.def b/backends/cuda/aoti_cuda.def
deleted file mode 100644
index a1f7433e077..00000000000
--- a/backends/cuda/aoti_cuda.def
+++ /dev/null
@@ -1,3 +0,0 @@
-LIBRARY aoti_cuda
-EXPORTS
-    InitCudaBackend
diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h
new file mode 100644
index 00000000000..ec42191227e
--- /dev/null
+++ b/backends/cuda/runtime/CudaBackend.h
@@ -0,0 +1,53 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+namespace executorch::backends::cuda {
+
+class ET_EXPERIMENTAL CudaBackend final
+    : public ::executorch::runtime::BackendInterface {
+ private:
+  /**
+   * Load AOTI function pointers from the shared library into the handle.
+   */
+  ::executorch::runtime::Error load_function_pointers_into_handle(
+      void* so_handle,
+      struct AOTIDelegateHandle* handle) const;
+
+ public:
+  /**
+   * Check if the CUDA backend is available.
+   */
+  bool is_available() const override;
+
+  /**
+   * Initialize the backend with the given context and compile specs.
+   * Called once per loaded binary blob.
+   */
+  ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
+      ::executorch::runtime::BackendInitContext& context,
+      ::executorch::runtime::FreeableBuffer* processed,
+      ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec>
+          compile_specs) const override;
+
+  /**
+   * Execute the backend with the given context and arguments.
+   * Called once per execution.
+   */
+  ::executorch::runtime::Error execute(
+      ::executorch::runtime::BackendExecutionContext& context,
+      ::executorch::runtime::DelegateHandle* handle,
+      ::executorch::runtime::Span<::executorch::runtime::EValue*> args)
+      const override;
+
+  /**
+   * Destroy the backend handle and clean up resources.
+   */
+  void destroy(::executorch::runtime::DelegateHandle* handle) const override;
+};
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 7f324b4135a..5733b6c62e9 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -19,10 +19,10 @@
 #include <string>
 #include <vector>
 
-// Include our shim layer headers
+// Include class header and shim layer headers
+#include <executorch/backends/cuda/runtime/CudaBackend.h>
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/aoti/common_shims.h>
-#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -48,12 +48,9 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-class ET_EXPERIMENTAL CudaBackend final
-    : public ::executorch::runtime::BackendInterface {
- private:
-  Error load_function_pointers_into_handle(
-      void* so_handle,
-      AOTIDelegateHandle* handle) const {
+Error CudaBackend::load_function_pointers_into_handle(
+    void* so_handle,
+    AOTIDelegateHandle* handle) const {
 #define LOAD_SYMBOL(member, name)                                    \
   do {                                                               \
     auto symbol_res = get_function(so_handle, #name);                \
@@ -88,17 +85,14 @@ class ET_EXPERIMENTAL CudaBackend final
     return Error::Ok;
   }
 
- public:
-  bool is_available() const override {
-    return 1;
-  }
+bool CudaBackend::is_available() const {
+  return 1;
+}
 
-  // Once per loaded binary blob
-  Result<DelegateHandle*> init(
+Result<DelegateHandle*> CudaBackend::init(
       BackendInitContext& context,
-      FreeableBuffer* processed, // This will be a empty buffer
-      ArrayRef<CompileSpec> compile_specs // This will be my empty list
-  ) const override {
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const {
     std::string method_name;
     for (const CompileSpec& spec : compile_specs) {
       if (std::strcmp(spec.key, "method_name") == 0) {
@@ -196,11 +190,10 @@ class ET_EXPERIMENTAL CudaBackend final
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
-  // Once per execution
-  Error execute(
-      BackendExecutionContext& context,
-      DelegateHandle* handle_,
-      Span<EValue*> args) const override {
+Error CudaBackend::execute(
+    BackendExecutionContext& context,
+    DelegateHandle* handle_,
+    Span<EValue*> args) const {
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
     size_t n_inputs;
@@ -322,7 +315,7 @@ class ET_EXPERIMENTAL CudaBackend final
     return Error::Ok;
   }
 
-  void destroy(DelegateHandle* handle_) const override {
+void CudaBackend::destroy(DelegateHandle* handle_) const {
     if (handle_ == nullptr) {
       return;
     }
@@ -367,54 +360,21 @@ class ET_EXPERIMENTAL CudaBackend final
     delete handle;
     clear_all_tensors();
   }
-};
 
 } // namespace executorch::backends::cuda
 
 namespace executorch::backends {
-namespace {
-// Static backend instance and registration
+
+// Backend instance - static on all platforms
 auto cls = cuda::CudaBackend();
-executorch::runtime::Backend backend{"CudaBackend", &cls};
+executorch::runtime::Backend cuda_backend{"CudaBackend", &cls};
 
 #ifndef _WIN32
-// On non-Windows platforms, use static initialization
+// On non-Windows platforms, use automatic static initialization
+namespace {
 static executorch::runtime::Error success_with_compiler =
-    register_backend(backend);
-#endif
-
+    register_backend(cuda_backend);
 } // namespace
-
-// InitCudaBackend is exported for explicit backend registration on Windows
-extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() {
-  // Log immediately to confirm function is entered
-  ET_LOG(Info, "InitCudaBackend: Function entered");
-  assert(1==2);
-  
-#ifdef _WIN32
-  ET_LOG(Info, "InitCudaBackend: Windows path");
-  // On Windows, explicitly register the backend since DLL static initializers
-  // don't run reliably
-  static bool initialized = false;
-  if (!initialized) {
-    ET_LOG(Info, "Registering CUDA backend on Windows");
-    auto error = register_backend(backend);
-    if (error == executorch::runtime::Error::Ok) {
-      ET_LOG(Info, "Successfully registered CudaBackend");
-    } else {
-      ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
-    }
-    initialized = true;
-  } else {
-    ET_LOG(Info, "CUDA backend already initialized");
-  }
-#else
-  ET_LOG(Info, "InitCudaBackend: Non-Windows path");
-  // On other platforms, static initialization already happened
-  (void)success_with_compiler;
 #endif
-  
-  ET_LOG(Info, "InitCudaBackend: Function exiting");
-}
 
 } // namespace executorch::backends
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index fd6250a41d9..c116c111154 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -25,6 +25,10 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/log.h>
 
+// Manually register the CUDA backend
+#include <executorch/backends/cuda/runtime/CudaBackend.h>
+#include <executorch/runtime/backend/interface.h>
+
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -282,14 +286,18 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
-#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
 
 int32_t main(int32_t argc, char** argv) {
-  // On Windows, explicitly initialize the CUDA backend to ensure
-  // static initializers in the DLL run
-  ET_LOG(Info, "About to call InitCudaBackend");
-  InitCudaBackend();
-  ET_LOG(Info, "InitCudaBackend returned");
+  // Manually register the CUDA backend (required on Windows, harmless on other platforms)
+  ET_LOG(Info, "Registering CUDA backend");
+  static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend();
+  static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl};
+  auto error = ::executorch::runtime::register_backend(cuda_backend);
+  if (error == ::executorch::runtime::Error::Ok) {
+    ET_LOG(Info, "Successfully registered CudaBackend");
+  } else {
+    ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
+  }
   
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 

From daae0c0e1d20ddf31c418343479a684f2894200f Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:46:45 -0700
Subject: [PATCH 20/34] var name

---
 examples/models/voxtral/multimodal.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index c116c111154..ce8f5cc590a 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -292,11 +292,11 @@ int32_t main(int32_t argc, char** argv) {
   ET_LOG(Info, "Registering CUDA backend");
   static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend();
   static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl};
-  auto error = ::executorch::runtime::register_backend(cuda_backend);
-  if (error == ::executorch::runtime::Error::Ok) {
+  auto register_error = ::executorch::runtime::register_backend(cuda_backend);
+  if (register_error == ::executorch::runtime::Error::Ok) {
     ET_LOG(Info, "Successfully registered CudaBackend");
   } else {
-    ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error);
+    ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)register_error);
   }
   
   gflags::ParseCommandLineFlags(&argc, &argv, true);

From b9d1a39dfc524ca4b1e901f317f536324abbd9d0 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:54:08 -0700
Subject: [PATCH 21/34] new approach

---
 backends/cuda/runtime/CudaBackend.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h
index ec42191227e..22dad0dad6e 100644
--- a/backends/cuda/runtime/CudaBackend.h
+++ b/backends/cuda/runtime/CudaBackend.h
@@ -2,13 +2,14 @@
 
 #pragma once
 
+#include <executorch/backends/cuda/runtime/export.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 
 namespace executorch::backends::cuda {
 
-class ET_EXPERIMENTAL CudaBackend final
+class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
   /**

From c0915f98492af094dffbf7fff25811acff76d1ed Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 17:57:39 -0700
Subject: [PATCH 22/34] test

---
 backends/cuda/runtime/CudaBackend.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h
index 22dad0dad6e..1c4fb70422b 100644
--- a/backends/cuda/runtime/CudaBackend.h
+++ b/backends/cuda/runtime/CudaBackend.h
@@ -2,6 +2,7 @@
 
 #pragma once
 
+#include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/cuda/runtime/export.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>

From 971bf499ac92d877f446b554bf98b129eb43f5eb Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 18:02:32 -0700
Subject: [PATCH 23/34] test

---
 backends/cuda/runtime/CudaBackend.h    | 9 ---------
 backends/cuda/runtime/cuda_backend.cpp | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h
index 1c4fb70422b..c15ac7ee8aa 100644
--- a/backends/cuda/runtime/CudaBackend.h
+++ b/backends/cuda/runtime/CudaBackend.h
@@ -2,7 +2,6 @@
 
 #pragma once
 
-#include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/cuda/runtime/export.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -12,14 +11,6 @@ namespace executorch::backends::cuda {
 
 class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
- private:
-  /**
-   * Load AOTI function pointers from the shared library into the handle.
-   */
-  ::executorch::runtime::Error load_function_pointers_into_handle(
-      void* so_handle,
-      struct AOTIDelegateHandle* handle) const;
-
  public:
   /**
    * Check if the CUDA backend is available.
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 5733b6c62e9..b0bd993a26e 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -48,7 +48,7 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-Error CudaBackend::load_function_pointers_into_handle(
+Error load_function_pointers_into_handle(
     void* so_handle,
     AOTIDelegateHandle* handle) const {
 #define LOAD_SYMBOL(member, name)                                    \

From d1b26ceb59a2c4cce32e57697fe5967edd46bed5 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 18:04:28 -0700
Subject: [PATCH 24/34] remove const

---
 backends/cuda/runtime/cuda_backend.cpp | 38 +++++++++++++-------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index b0bd993a26e..c7d7b634799 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -48,9 +48,9 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-Error load_function_pointers_into_handle(
+static Error load_function_pointers_into_handle(
     void* so_handle,
-    AOTIDelegateHandle* handle) const {
+    AOTIDelegateHandle* handle) {
 #define LOAD_SYMBOL(member, name)                                    \
   do {                                                               \
     auto symbol_res = get_function(so_handle, #name);                \
@@ -60,30 +60,30 @@ Error load_function_pointers_into_handle(
     handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
   } while (0)
 
-    LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice);
+  LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice);
 
-    LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete);
+  LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete);
 
-    LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs);
+  LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs);
 
-    LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs);
+  LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs);
 
-    LOAD_SYMBOL(run, AOTInductorModelContainerRun);
+  LOAD_SYMBOL(run, AOTInductorModelContainerRun);
 #undef LOAD_SYMBOL
 
-    auto symbol_res =
-        get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob");
-    if (symbol_res.ok()) {
-      handle->update_constants_from_blob =
-          reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
-              symbol_res.get());
-    } else {
-      ET_LOG(
-          Info,
-          "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)");
-    }
-    return Error::Ok;
+  auto symbol_res =
+      get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob");
+  if (symbol_res.ok()) {
+    handle->update_constants_from_blob =
+        reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
+            symbol_res.get());
+  } else {
+    ET_LOG(
+        Info,
+        "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)");
   }
+  return Error::Ok;
+}
 
 bool CudaBackend::is_available() const {
   return 1;

From 9dc28d933656f06f34dce49e27d6925a14aefac6 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 18:34:55 -0700
Subject: [PATCH 25/34] hacks

---
 CMakeLists.txt                 |  7 ++--
 backends/aoti/common_shims.cpp |  6 ++-
 backends/aoti/common_shims.h   | 67 +++++++++++++++++++++++-----------
 backends/cuda/CMakeLists.txt   |  3 +-
 4 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b96c12fbf3..bf5188acacc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,11 +99,10 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+# Force logging to always be enabled for this build.
 if(NOT EXECUTORCH_ENABLE_LOGGING)
-  # Avoid pulling in the logging strings, which can be large. Note that this
-  # will set the compiler flag for all targets in this directory, and for all
-  # subdirectories included after this point.
-  add_definitions(-DET_LOG_ENABLED=0)
+  message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.")
+  set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Build with ET_LOG_ENABLED" FORCE)
 endif()
 
 add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL})
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index ac87d49d5a5..eca0f91589b 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -16,8 +16,10 @@ namespace aoti {
 
 namespace internal {
 // Global storage for tensor metadata
-std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
-std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
+    tensor_to_sizes;
+AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
+    tensor_to_strides;
 } // namespace internal
 
 extern "C" {
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 1b0429e3aba..dae712dfdb4 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -15,6 +15,13 @@
 #include <unordered_map>
 #include <vector>
 
+#if defined(BUILDING_CUDA_BACKEND)
+#include <executorch/backends/cuda/runtime/export.h>
+#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT
+#else
+#define AOTI_SHIM_EXPORT
+#endif
+
 namespace executorch {
 namespace backends {
 namespace aoti {
@@ -30,52 +37,68 @@ using AOTIRuntimeError = Error;
 using AOTITorchError = Error;
 
 // Global storage for tensor metadata
-extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
-extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+extern AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
+    tensor_to_sizes;
+extern AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
+    tensor_to_strides;
 
 // Attribute-related operations (memory-irrelevant)
-AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr(
+    Tensor* tensor,
+    void** ret_data_ptr);
 
-AOTITorchError aoti_torch_get_storage_offset(
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_offset(
     Tensor* tensor,
     int64_t* ret_storage_offset);
 
-AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_strides(
+    Tensor* tensor,
+    int64_t** ret_strides);
 
-AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dtype(
+    Tensor* tensor,
+    int32_t* ret_dtype);
 
-AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_sizes(
+    Tensor* tensor,
+    int64_t** ret_sizes);
 
-AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size(
+    Tensor* tensor,
+    int64_t* ret_size);
 
-AOTITorchError aoti_torch_get_device_index(
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_index(
     Tensor* tensor,
     int32_t* ret_device_index);
 
-AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dim(
+    Tensor* tensor,
+    int64_t* ret_dim);
 
 // Utility functions for device and layout information
-int32_t aoti_torch_device_type_cpu();
-int32_t aoti_torch_layout_strided();
-int32_t aoti_torch_dtype_float32();
-int32_t aoti_torch_dtype_bfloat16();
-int32_t aoti_torch_dtype_int8();
-int32_t aoti_torch_dtype_int16();
-int32_t aoti_torch_dtype_int32();
-int32_t aoti_torch_dtype_int64();
+AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
+AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64();
 
 // Dtype utility function needed by Metal backend
-size_t aoti_torch_dtype_element_size(int32_t dtype);
+AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype);
 
 // Autograd mode functions
-int32_t aoti_torch_grad_mode_is_enabled();
-void aoti_torch_grad_mode_set_enabled(bool enabled);
+AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled();
+AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled);
 
 // Cleanup functions for clearing global state
-void cleanup_tensor_metadata();
+AOTI_SHIM_EXPORT void cleanup_tensor_metadata();
 
 } // extern "C"
 
 } // namespace aoti
 } // namespace backends
 } // namespace executorch
+
+#undef AOTI_SHIM_EXPORT
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 9befbe5cc56..8cc71dbf437 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -43,6 +43,7 @@ set(_aoti_cuda_sources
     runtime/shims/cuda_guard.cpp
     runtime/shims/int4mm.cu
     runtime/platform/platform.cpp
+    ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
 )
 # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
 if(MSVC)
@@ -81,7 +82,7 @@ target_link_options(
 
 # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)

From 8b4696434b3c24d39b4b1ecb791e9750de1413d5 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 18:40:36 -0700
Subject: [PATCH 26/34] move vectors

---
 backends/aoti/common_shims.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index dae712dfdb4..162f4d39b78 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -30,18 +30,16 @@ namespace aoti {
 using executorch::runtime::Error;
 using executorch::runtime::etensor::Tensor;
 
+// Global storage for tensor metadata
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+
 extern "C" {
 
 // Common AOTI type aliases
 using AOTIRuntimeError = Error;
 using AOTITorchError = Error;
 
-// Global storage for tensor metadata
-extern AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
-    tensor_to_sizes;
-extern AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
-    tensor_to_strides;
-
 // Attribute-related operations (memory-irrelevant)
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr(
     Tensor* tensor,
@@ -100,5 +98,3 @@ AOTI_SHIM_EXPORT void cleanup_tensor_metadata();
 } // namespace aoti
 } // namespace backends
 } // namespace executorch
-
-#undef AOTI_SHIM_EXPORT

From cbacd6c4ed9af138e548a13a64cac43a10ec359f Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 19:38:27 -0700
Subject: [PATCH 27/34] lint

---
 CMakeLists.txt                                |   5 +-
 backends/aoti/common_shims.h                  |  40 +-
 backends/cuda/CMakeLists.txt                  |  20 +-
 backends/cuda/runtime/cuda_backend.cpp        | 484 +++++++++---------
 backends/cuda/runtime/export.h                |   9 +-
 backends/cuda/runtime/shims/cuda_guard.h      |  23 +-
 .../cuda/runtime/shims/tensor_attribute.h     |   7 +-
 examples/models/voxtral/multimodal.cpp        |  14 +-
 8 files changed, 301 insertions(+), 301 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf5188acacc..e04760f659d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,7 +102,10 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 # Force logging to always be enabled for this build.
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.")
-  set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Build with ET_LOG_ENABLED" FORCE)
+  set(EXECUTORCH_ENABLE_LOGGING
+      ON
+      CACHE BOOL "Build with ET_LOG_ENABLED" FORCE
+  )
 endif()
 
 add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL})
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 162f4d39b78..2b3f32626e4 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -41,37 +41,29 @@ using AOTIRuntimeError = Error;
 using AOTITorchError = Error;
 
 // Attribute-related operations (memory-irrelevant)
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr(
-    Tensor* tensor,
-    void** ret_data_ptr);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_offset(
-    Tensor* tensor,
-    int64_t* ret_storage_offset);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_strides(
-    Tensor* tensor,
-    int64_t** ret_strides);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dtype(
-    Tensor* tensor,
-    int32_t* ret_dtype);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_sizes(
-    Tensor* tensor,
-    int64_t** ret_sizes);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size(
-    Tensor* tensor,
-    int64_t* ret_size);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_index(
-    Tensor* tensor,
-    int32_t* ret_device_index);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dim(
-    Tensor* tensor,
-    int64_t* ret_dim);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 
 // Utility functions for device and layout information
 AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 8cc71dbf437..3d0aa450b44 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -49,16 +49,18 @@ set(_aoti_cuda_sources
 if(MSVC)
   add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
   # Define export macros for Windows DLL
-  target_compile_definitions(aoti_cuda PRIVATE
-    EXPORT_AOTI_FUNCTIONS
-    BUILDING_CUDA_BACKEND
+  target_compile_definitions(
+    aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND
   )
-  # Ensure proper DLL import/export library naming on Windows with config-specific paths
-  set_target_properties(aoti_cuda PROPERTIES
-    WINDOWS_EXPORT_ALL_SYMBOLS OFF  # We use explicit exports via AOTI_CUDA_EXPORT
-    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
-    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
-    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
+  # Ensure proper DLL import/export library naming on Windows with
+  # config-specific paths
+  set_target_properties(
+    aoti_cuda
+    PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via
+                                              # AOTI_CUDA_EXPORT
+               RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
+               LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
+               ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
   )
 else()
   add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index c7d7b634799..029f7ad16bb 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -11,8 +11,8 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
-#include <cstdio>
 #include <cassert>
+#include <cstdio>
 
 #include <filesystem>
 #include <fstream>
@@ -20,9 +20,9 @@
 #include <vector>
 
 // Include class header and shim layer headers
-#include <executorch/backends/cuda/runtime/CudaBackend.h>
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/CudaBackend.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -90,277 +90,277 @@ bool CudaBackend::is_available() const {
 }
 
 Result<DelegateHandle*> CudaBackend::init(
-      BackendInitContext& context,
-      FreeableBuffer* processed,
-      ArrayRef<CompileSpec> compile_specs) const {
-    std::string method_name;
-    for (const CompileSpec& spec : compile_specs) {
-      if (std::strcmp(spec.key, "method_name") == 0) {
-        method_name.assign(
-            static_cast<const char*>(spec.value.buffer),
-            spec.value.nbytes); // no nullptr guarantee, so pass size
-        break;
-      }
+    BackendInitContext& context,
+    FreeableBuffer* processed,
+    ArrayRef<CompileSpec> compile_specs) const {
+  std::string method_name;
+  for (const CompileSpec& spec : compile_specs) {
+    if (std::strcmp(spec.key, "method_name") == 0) {
+      method_name.assign(
+          static_cast<const char*>(spec.value.buffer),
+          spec.value.nbytes); // no nullptr guarantee, so pass size
+      break;
     }
+  }
 
-    std::string so_blob_key =
-        method_name.empty() ? "so_blob" : method_name + "_so_blob";
-
-    const NamedDataMap* named_data_map = context.get_named_data_map();
-    auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
-    ET_CHECK_OR_RETURN_ERROR(
-        aoti_dso_buffer.ok(),
-        Internal,
-        "Failed to get data for key %s: 0x%x",
-        so_blob_key.c_str(),
-        static_cast<uint32_t>(aoti_dso_buffer.error()));
-
-    // Generate dynamic temporary file path
-    filesystem::path temp_dir = filesystem::temp_directory_path();
-    filesystem::path so_path =
-        temp_dir / (so_blob_key + to_string(get_process_id()) + ".so");
-
-    // Create a temporary file
-    ofstream outfile(so_path, ios::binary);
-
-    // Write the ELF buffer to the temporary file
-    ET_LOG(
-        Info,
-        "Writing %zu bytes to %s",
-        aoti_dso_buffer->size(),
-        so_path.c_str());
-
-    outfile.write(
-        static_cast<const char*>(aoti_dso_buffer->data()),
-        aoti_dso_buffer->size());
+  std::string so_blob_key =
+      method_name.empty() ? "so_blob" : method_name + "_so_blob";
+
+  const NamedDataMap* named_data_map = context.get_named_data_map();
+  auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
+  ET_CHECK_OR_RETURN_ERROR(
+      aoti_dso_buffer.ok(),
+      Internal,
+      "Failed to get data for key %s: 0x%x",
+      so_blob_key.c_str(),
+      static_cast<uint32_t>(aoti_dso_buffer.error()));
+
+  // Generate dynamic temporary file path
+  filesystem::path temp_dir = filesystem::temp_directory_path();
+  filesystem::path so_path =
+      temp_dir / (so_blob_key + to_string(get_process_id()) + ".so");
+
+  // Create a temporary file
+  ofstream outfile(so_path, ios::binary);
+
+  // Write the ELF buffer to the temporary file
+  ET_LOG(
+      Info,
+      "Writing %zu bytes to %s",
+      aoti_dso_buffer->size(),
+      so_path.c_str());
+
+  outfile.write(
+      static_cast<const char*>(aoti_dso_buffer->data()),
+      aoti_dso_buffer->size());
+
+  ET_CHECK_OR_RETURN_ERROR(
+      outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
+
+  // Finish writing the file to disk
+  outfile.close();
+
+  // Free the buffer immediately after writing to disk
+  aoti_dso_buffer->Free();
+  // Load the lib
+  Result<void*> lib_handle_res = load_library(so_path);
+  if (!lib_handle_res.ok()) {
+    return lib_handle_res.error();
+  }
+  void* lib_handle = lib_handle_res.get();
 
-    ET_CHECK_OR_RETURN_ERROR(
-        outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
+  processed->Free();
 
-    // Finish writing the file to disk
-    outfile.close();
+  // Create handle and load function pointers into it
+  AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+  handle->so_handle = lib_handle;
+  handle->so_path = so_path.string();
 
-    // Free the buffer immediately after writing to disk
-    aoti_dso_buffer->Free();
-    // Load the lib
-    Result<void*> lib_handle_res = load_library(so_path);
-    if (!lib_handle_res.ok()) {
-      return lib_handle_res.error();
-    }
-    void* lib_handle = lib_handle_res.get();
+  // Load function pointers specific to this handle's shared library
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      load_function_pointers_into_handle(lib_handle, handle));
 
-    processed->Free();
+  AOTInductorModelContainerHandle container_handle = nullptr;
 
-    // Create handle and load function pointers into it
-    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
-    handle->so_handle = lib_handle;
-    handle->so_path = so_path.string();
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      handle->create_with_device(&container_handle, 1, "cuda", nullptr));
 
-    // Load function pointers specific to this handle's shared library
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        load_function_pointers_into_handle(lib_handle, handle));
+  ET_LOG(Info, "container_handle = %p", container_handle);
 
-    AOTInductorModelContainerHandle container_handle = nullptr;
+  handle->container_handle = container_handle;
 
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        handle->create_with_device(&container_handle, 1, "cuda", nullptr));
-
-    ET_LOG(Info, "container_handle = %p", container_handle);
-
-    handle->container_handle = container_handle;
-
-    // Look into named data map for constant data
-    std::string weights_blob_key =
-        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
-    auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
-    if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
-      ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
-      const void* weights_blob = buffer_res->data();
-      // Feed the weights blob into the container. Under the hood it's copying
-      // weights, so we should free the buffer immediately.
-      ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
-          handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
-      buffer_res->Free();
-    }
-    // Create a CUDA stream for asynchronous execution
-    cudaStream_t cuda_stream;
-    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
-    handle->cuda_stream = static_cast<void*>(cuda_stream);
-
-    return (DelegateHandle*)handle; // Return the handle post-processing
+  // Look into named data map for constant data
+  std::string weights_blob_key =
+      method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+  auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
+  if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
+    ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
+    const void* weights_blob = buffer_res->data();
+    // Feed the weights blob into the container. Under the hood it's copying
+    // weights, so we should free the buffer immediately.
+    ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
+        handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
+    buffer_res->Free();
   }
+  // Create a CUDA stream for asynchronous execution
+  cudaStream_t cuda_stream;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
+  handle->cuda_stream = static_cast<void*>(cuda_stream);
+
+  return (DelegateHandle*)handle; // Return the handle post-processing
+}
 
 Error CudaBackend::execute(
     BackendExecutionContext& context,
     DelegateHandle* handle_,
     Span<EValue*> args) const {
-    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+  AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+  size_t n_inputs;
+  handle->get_num_inputs(handle->container_handle, &n_inputs);
+
+  size_t n_outputs;
+  handle->get_num_outputs(handle->container_handle, &n_outputs);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      n_inputs + n_outputs == args.size(),
+      InvalidArgument,
+      "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+      n_inputs,
+      n_outputs,
+      args.size())
+
+  // NOTE: ExecuTorch tensors are always on CPU/host memory
+  // We need to create GPU copies for CUDA kernel execution
+  std::vector<AOTITensorHandle> gpu_inputs(
+      n_inputs); // GPU copies for kernel execution
+  std::vector<AOTITensorHandle> gpu_outputs(
+      n_outputs); // GPU tensors for kernel output
+
+  // Process input tensors: ExecuTorch provides CPU tensors, create GPU
+  // copies
+  for (int i = 0; i < n_inputs; i++) {
+    // Get tensor dimensions and properties from ExecuTorch CPU tensor
+    auto cpu_tensor = &(args[i]->toTensor());
+    auto sizes = cpu_tensor->sizes();
+    auto scalar_type = cpu_tensor->scalar_type();
+
+    // Create GPU tensor with same shape
+    std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+    AOTITensorHandle gpu_input_handle;
+    Error create_err = aoti_torch_empty_strided(
+        sizes_vec.size(),
+        sizes_vec.data(),
+        nullptr, // use default strides
+        static_cast<int32_t>(scalar_type),
+        1, // device_type = cuda
+        0, // device_index = 0
+        &gpu_input_handle);
 
-    size_t n_inputs;
-    handle->get_num_inputs(handle->container_handle, &n_inputs);
+    ET_CHECK_OR_RETURN_ERROR(
+        create_err == Error::Ok,
+        Internal,
+        "Failed to create GPU tensor for input %d",
+        i);
 
-    size_t n_outputs;
-    handle->get_num_outputs(handle->container_handle, &n_outputs);
+    gpu_inputs[i] = gpu_input_handle;
 
+    // Copy data from CPU to GPU
     ET_CHECK_OR_RETURN_ERROR(
-        n_inputs + n_outputs == args.size(),
-        InvalidArgument,
-        "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
-        n_inputs,
-        n_outputs,
-        args.size())
-
-    // NOTE: ExecuTorch tensors are always on CPU/host memory
-    // We need to create GPU copies for CUDA kernel execution
-    std::vector<AOTITensorHandle> gpu_inputs(
-        n_inputs); // GPU copies for kernel execution
-    std::vector<AOTITensorHandle> gpu_outputs(
-        n_outputs); // GPU tensors for kernel output
-
-    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
-    // copies
-    for (int i = 0; i < n_inputs; i++) {
-      // Get tensor dimensions and properties from ExecuTorch CPU tensor
-      auto cpu_tensor = &(args[i]->toTensor());
-      auto sizes = cpu_tensor->sizes();
-      auto scalar_type = cpu_tensor->scalar_type();
-
-      // Create GPU tensor with same shape
-      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
-
-      AOTITensorHandle gpu_input_handle;
-      Error create_err = aoti_torch_empty_strided(
-          sizes_vec.size(),
-          sizes_vec.data(),
-          nullptr, // use default strides
-          static_cast<int32_t>(scalar_type),
-          1, // device_type = cuda
-          0, // device_index = 0
-          &gpu_input_handle);
-
-      ET_CHECK_OR_RETURN_ERROR(
-          create_err == Error::Ok,
-          Internal,
-          "Failed to create GPU tensor for input %d",
-          i);
-
-      gpu_inputs[i] = gpu_input_handle;
-
-      // Copy data from CPU to GPU
-      ET_CHECK_OR_RETURN_ERROR(
-          aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
-          Internal,
-          "Failed to copy input %d from CPU to GPU",
-          i);
-    }
-    // Process output tensors: create GPU counterparts for ExecuTorch CPU
-    // tensors
-    for (int i = 0; i < n_outputs; i++) {
-      // Get output tensor dimensions from ExecuTorch CPU tensor
-      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      auto sizes = cpu_output_tensor->sizes();
-      auto scalar_type = cpu_output_tensor->scalar_type();
-
-      // Create GPU tensor with same shape for kernel output
-      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
-
-      AOTITensorHandle gpu_output_handle;
-      Error create_err = aoti_torch_empty_strided(
-          sizes_vec.size(),
-          sizes_vec.data(),
-          nullptr, // use default strides
-          static_cast<int32_t>(scalar_type),
-          1, // device_type = cuda
-          0, // device_index = 0
-          &gpu_output_handle);
-
-      ET_CHECK_OR_RETURN_ERROR(
-          create_err == Error::Ok,
-          Internal,
-          "Failed to create GPU tensor for output %d",
-          i);
-
-      gpu_outputs[i] = gpu_output_handle;
-    }
-    // Run AOTI container with GPU tensors
-    AOTIRuntimeError error = handle->run(
-        handle->container_handle,
-        gpu_inputs.data(), // Use GPU input tensors
-        n_inputs,
-        gpu_outputs.data(), // Use GPU output tensors
-        n_outputs,
-        handle->cuda_stream, // Pass the actual CUDA stream
-        nullptr); // proxy_executor_handle can remain nullptr
+        aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
+        Internal,
+        "Failed to copy input %d from CPU to GPU",
+        i);
+  }
+  // Process output tensors: create GPU counterparts for ExecuTorch CPU
+  // tensors
+  for (int i = 0; i < n_outputs; i++) {
+    // Get output tensor dimensions from ExecuTorch CPU tensor
+    auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+    auto sizes = cpu_output_tensor->sizes();
+    auto scalar_type = cpu_output_tensor->scalar_type();
+
+    // Create GPU tensor with same shape for kernel output
+    std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+    AOTITensorHandle gpu_output_handle;
+    Error create_err = aoti_torch_empty_strided(
+        sizes_vec.size(),
+        sizes_vec.data(),
+        nullptr, // use default strides
+        static_cast<int32_t>(scalar_type),
+        1, // device_type = cuda
+        0, // device_index = 0
+        &gpu_output_handle);
 
     ET_CHECK_OR_RETURN_ERROR(
-        error == Error::Ok,
+        create_err == Error::Ok,
         Internal,
-        "AOTInductorModelContainerRun failed with error code %d",
-        error);
-
-    // Copy GPU output results back to CPU output tensors
-    for (int i = 0; i < n_outputs; i++) {
-      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      // For DYNAMIC_BOUND tensors we try to resize
-      ET_CHECK_OK_OR_RETURN_ERROR(
-          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-          "Error resizing tensor at output index %d",
-          i);
-      ET_CHECK_OK_OR_RETURN_ERROR(
-          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-          "Failed to copy GPU output %d back to CPU",
-          i);
-    }
+        "Failed to create GPU tensor for output %d",
+        i);
 
-    return Error::Ok;
+    gpu_outputs[i] = gpu_output_handle;
+  }
+  // Run AOTI container with GPU tensors
+  AOTIRuntimeError error = handle->run(
+      handle->container_handle,
+      gpu_inputs.data(), // Use GPU input tensors
+      n_inputs,
+      gpu_outputs.data(), // Use GPU output tensors
+      n_outputs,
+      handle->cuda_stream, // Pass the actual CUDA stream
+      nullptr); // proxy_executor_handle can remain nullptr
+
+  ET_CHECK_OR_RETURN_ERROR(
+      error == Error::Ok,
+      Internal,
+      "AOTInductorModelContainerRun failed with error code %d",
+      error);
+
+  // Copy GPU output results back to CPU output tensors
+  for (int i = 0; i < n_outputs; i++) {
+    auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+    // For DYNAMIC_BOUND tensors we try to resize
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+        "Error resizing tensor at output index %d",
+        i);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+        "Failed to copy GPU output %d back to CPU",
+        i);
   }
 
-void CudaBackend::destroy(DelegateHandle* handle_) const {
-    if (handle_ == nullptr) {
-      return;
-    }
-    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
-
-    // Destroy the CUDA stream if it exists
-    if (handle->cuda_stream != nullptr) {
-      cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
-      cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
-      ET_CHECK_OR_LOG_ERROR(
-          stream_err == cudaSuccess,
-          "Failed to destroy CUDA stream: %s",
-          cudaGetErrorString(stream_err));
-      handle->cuda_stream = nullptr;
-    }
+  return Error::Ok;
+}
 
-    // NOTE: AOTInductorModelContainerDelete does not work correctly with
-    // multiple .so files. Deleting one container frees shared resources,
-    // which causes segmentation faults when attempting to delete other
-    // containers. As a workaround, we skip explicit container deletion
-    // and defer cleanup to the OS.
-    // TODO(gasoonjia): Find a proper solution for safe container deletion.
-    // AOTInductorModelContainerDelete(handle->container_handle);
-
-    // Now close the shared library
-    auto err = Error::Ok;
-    if (handle->so_handle != nullptr) {
-      err = close_library(handle->so_handle);
-    }
+void CudaBackend::destroy(DelegateHandle* handle_) const {
+  if (handle_ == nullptr) {
+    return;
+  }
+  AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+  // Destroy the CUDA stream if it exists
+  if (handle->cuda_stream != nullptr) {
+    cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
+    cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
+    ET_CHECK_OR_LOG_ERROR(
+        stream_err == cudaSuccess,
+        "Failed to destroy CUDA stream: %s",
+        cudaGetErrorString(stream_err));
+    handle->cuda_stream = nullptr;
+  }
 
-    // Remove the temporary shared library file
-    if (!handle->so_path.empty()) {
-      std::error_code remove_error;
-      std::filesystem::remove(handle->so_path, remove_error);
-      ET_CHECK_OR_LOG_ERROR(
-          !remove_error,
-          "Failed to remove temporary shared library %s: %s",
-          handle->so_path.c_str(),
-          remove_error.message().c_str());
-    }
+  // NOTE: AOTInductorModelContainerDelete does not work correctly with
+  // multiple .so files. Deleting one container frees shared resources,
+  // which causes segmentation faults when attempting to delete other
+  // containers. As a workaround, we skip explicit container deletion
+  // and defer cleanup to the OS.
+  // TODO(gasoonjia): Find a proper solution for safe container deletion.
+  // AOTInductorModelContainerDelete(handle->container_handle);
+
+  // Now close the shared library
+  auto err = Error::Ok;
+  if (handle->so_handle != nullptr) {
+    err = close_library(handle->so_handle);
+  }
 
-    delete handle;
-    clear_all_tensors();
+  // Remove the temporary shared library file
+  if (!handle->so_path.empty()) {
+    std::error_code remove_error;
+    std::filesystem::remove(handle->so_path, remove_error);
+    ET_CHECK_OR_LOG_ERROR(
+        !remove_error,
+        "Failed to remove temporary shared library %s: %s",
+        handle->so_path.c_str(),
+        remove_error.message().c_str());
   }
 
+  delete handle;
+  clear_all_tensors();
+}
+
 } // namespace executorch::backends::cuda
 
 namespace executorch::backends {
diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h
index 23623888e85..65fe7192611 100644
--- a/backends/cuda/runtime/export.h
+++ b/backends/cuda/runtime/export.h
@@ -9,10 +9,11 @@
 #pragma once
 
 // Define export macro for Windows DLL
-// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by CMake,
-// which causes this macro to export symbols using __declspec(dllexport).
-// When consuming the library, the macro imports symbols using __declspec(dllimport).
-// On non-Windows platforms, the macro is empty and has no effect.
+// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by
+// CMake, which causes this macro to export symbols using __declspec(dllexport).
+// When consuming the library, the macro imports symbols using
+// __declspec(dllimport). On non-Windows platforms, the macro is empty and has
+// no effect.
 #ifdef _WIN32
 #ifdef EXPORT_AOTI_FUNCTIONS
 #define AOTI_CUDA_EXPORT __declspec(dllexport)
diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h
index 932ce6ab785..ec2b381f524 100644
--- a/backends/cuda/runtime/shims/cuda_guard.h
+++ b/backends/cuda/runtime/shims/cuda_guard.h
@@ -10,8 +10,8 @@
 
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/common_shims.h>
-#include <executorch/backends/cuda/runtime/guard.h>
 #include <executorch/backends/cuda/runtime/export.h>
+#include <executorch/backends/cuda/runtime/guard.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
@@ -34,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*;
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard(
-    int32_t device_index,
-    CUDAGuardHandle* ret_guard);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard);
 
 /**
  * Deletes a CUDA device guard and frees its associated resources.
@@ -45,7 +44,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
 
 /**
  * Sets the CUDA device to a new index for an existing guard.
@@ -55,9 +55,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle gua
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index(
-    CUDAGuardHandle guard,
-    int32_t device_index);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index);
 
 /**
  * Creates a CUDA stream guard that sets the current device and stream,
@@ -82,7 +81,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
 
 /**
  * Gets the current CUDA stream for a specified device.
@@ -92,9 +92,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGu
  * @return AOTITorchError error code (Error::Ok on success, or an error code on
  * failure)
  */
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream(
-    int32_t device_index,
-    void** ret_stream);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream);
 
 } // extern "C"
 
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
index 8b4950a154d..3c881df63b3 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.h
+++ b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -8,9 +8,9 @@
 
 #pragma once
 
+#include <executorch/backends/cuda/runtime/export.h>
 #include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/error.h>
-#include <executorch/backends/cuda/runtime/export.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
@@ -25,9 +25,8 @@ extern "C" {
 using AOTITorchError = Error;
 
 // Device type functions for tensor attributes
-AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_device_type(
-    Tensor* tensor,
-    int32_t* ret_device_type);
+AOTI_CUDA_EXPORT AOTITorchError
+aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type);
 
 // Device type constants
 AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda();
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index ce8f5cc590a..ff483cce516 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -286,19 +286,23 @@ MultimodalInput processAudioFile(
 
 } // namespace
 
-
 int32_t main(int32_t argc, char** argv) {
-  // Manually register the CUDA backend (required on Windows, harmless on other platforms)
+  // Manually register the CUDA backend (required on Windows, harmless on other
+  // platforms)
   ET_LOG(Info, "Registering CUDA backend");
   static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend();
-  static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl};
+  static auto cuda_backend =
+      ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl};
   auto register_error = ::executorch::runtime::register_backend(cuda_backend);
   if (register_error == ::executorch::runtime::Error::Ok) {
     ET_LOG(Info, "Successfully registered CudaBackend");
   } else {
-    ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)register_error);
+    ET_LOG(
+        Error,
+        "Failed to register CudaBackend: error code %d",
+        (int)register_error);
   }
-  
+
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   const char* model_path = FLAGS_model_path.c_str();

From bec6c5fca1cabeab0036139c05bfa4019d640b7d Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 21:04:13 -0700
Subject: [PATCH 28/34] stub

---
 backends/aoti/common_shims.cpp | 53 +++++++++++++++++++++++++---------
 backends/aoti/common_shims.h   | 10 +++++++
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index eca0f91589b..7b1728bf6e4 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
 
@@ -14,13 +15,11 @@ namespace executorch {
 namespace backends {
 namespace aoti {
 
-namespace internal {
 // Global storage for tensor metadata
 AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
     tensor_to_sizes;
 AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
     tensor_to_strides;
-} // namespace internal
 
 extern "C" {
 
@@ -52,10 +51,10 @@ AOTITorchError aoti_torch_get_storage_offset(
 }
 
 AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
-  auto it = internal::tensor_to_strides.find(tensor);
+  auto it = tensor_to_strides.find(tensor);
   bool needs_update = false;
 
-  if (it == internal::tensor_to_strides.end()) {
+  if (it == tensor_to_strides.end()) {
     needs_update = true;
   } else {
     // CRITICAL: Multimodal models reuse tensors with different shapes across
@@ -76,9 +75,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
     for (int i = 0; i < tensor->dim(); i++) {
       strides[i] = tensor_strides[i];
     }
-    it =
-        internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
-            .first;
+    it = tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -100,10 +97,10 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
 }
 
 AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
-  auto it = internal::tensor_to_sizes.find(tensor);
+  auto it = tensor_to_sizes.find(tensor);
   bool needs_update = false;
 
-  if (it == internal::tensor_to_sizes.end()) {
+  if (it == tensor_to_sizes.end()) {
     needs_update = true;
   } else {
     // CRITICAL: Multimodal models reuse tensors with different shapes across
@@ -124,8 +121,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
     for (int i = 0; i < tensor->dim(); i++) {
       sizes[i] = tensor_sizes[i];
     }
-    it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
-             .first;
+    it = tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -198,8 +194,39 @@ size_t aoti_torch_dtype_element_size(int32_t dtype) {
 
 // Cleanup functions
 void cleanup_tensor_metadata() {
-  internal::tensor_to_sizes.clear();
-  internal::tensor_to_strides.clear();
+  tensor_to_sizes.clear();
+  tensor_to_strides.clear();
+}
+
+void aoti_torch_warn(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg) {
+  ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg);
+}
+
+AOTITorchError aoti_torch_clone_preserve_strides(
+    Tensor* self,
+    Tensor** ret_new_tensor) {
+  ET_CHECK_OR_RETURN_ERROR(
+      self != nullptr,
+      InvalidArgument,
+      "aoti_torch_clone_preserve_strides failed: self tensor is null");
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_new_tensor != nullptr,
+      InvalidArgument,
+      "aoti_torch_clone_preserve_strides failed: ret_new_tensor is null");
+
+  auto cloned = executorch::extension::clone_tensor_ptr(*self);
+  ET_CHECK_OR_RETURN_ERROR(
+      cloned != nullptr,
+      InvalidArgument,
+      "aoti_torch_clone_preserve_strides failed: tensor clone returned null");
+
+  *ret_new_tensor = new Tensor(*cloned);
+
+  return Error::Ok;
 }
 
 } // extern "C"
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 2b3f32626e4..e95b2b74cbc 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -85,6 +85,16 @@ AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled);
 // Cleanup functions for clearing global state
 AOTI_SHIM_EXPORT void cleanup_tensor_metadata();
 
+AOTI_SHIM_EXPORT void aoti_torch_warn(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg);
+
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides(
+    Tensor* self,
+    Tensor** ret_new_tensor);
+
 } // extern "C"
 
 } // namespace aoti

From d8fafd34ef83c50e7f3bb81a11f9bda20951a203 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 21:13:39 -0700
Subject: [PATCH 29/34] stubbed shims

---
 backends/aoti/common_shims.cpp | 72 ++++++++++++++++++++++++----------
 backends/aoti/common_shims.h   | 23 ++++++++++-
 2 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index 7b1728bf6e4..e8c56a45946 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -206,27 +206,59 @@ void aoti_torch_warn(
   ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg);
 }
 
-AOTITorchError aoti_torch_clone_preserve_strides(
-    Tensor* self,
-    Tensor** ret_new_tensor) {
-  ET_CHECK_OR_RETURN_ERROR(
-      self != nullptr,
-      InvalidArgument,
-      "aoti_torch_clone_preserve_strides failed: self tensor is null");
-  ET_CHECK_OR_RETURN_ERROR(
-      ret_new_tensor != nullptr,
-      InvalidArgument,
-      "aoti_torch_clone_preserve_strides failed: ret_new_tensor is null");
-
-  auto cloned = executorch::extension::clone_tensor_ptr(*self);
-  ET_CHECK_OR_RETURN_ERROR(
-      cloned != nullptr,
-      InvalidArgument,
-      "aoti_torch_clone_preserve_strides failed: tensor clone returned null");
-
-  *ret_new_tensor = new Tensor(*cloned);
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) {
+  (void)tensor;
+  (void)ret_size;
+  throw std::runtime_error("Not implemented");
+  return Error::Internal;
+}
 
-  return Error::Ok;
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) {
+  (void)self;
+  (void)ret_new_tensor;
+  throw std::runtime_error("Not implemented");
+  return Error::Internal;
+}
+
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) {
+  (void)self;
+  (void)ret_new_tensor;
+  throw std::runtime_error("Not implemented");
+  return Error::Internal;
+}
+
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) {
+  (void)orig_handle;
+  (void)new_handle;
+  throw std::runtime_error("Not implemented");
+  return Error::Internal;
+}
+
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data_ptr,
+    int64_t ndim,
+    const int64_t* sizes,
+    const int64_t* strides,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    Tensor** ret_new_tensor) {
+  (void)data_ptr;
+  (void)ndim;
+  (void)sizes;
+  (void)strides;
+  (void)storage_offset;
+  (void)dtype;
+  (void)device_type;
+  (void)device_index;
+  (void)ret_new_tensor;
+  throw std::runtime_error("Not implemented");
+  return Error::Internal;
 }
 
 } // extern "C"
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index e95b2b74cbc..40849a9d5af 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -91,8 +91,27 @@ AOTI_SHIM_EXPORT void aoti_torch_warn(
     uint32_t line,
     const char* msg);
 
-AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides(
-    Tensor* self,
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
+
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor);
+
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor);
+
+AOTI_SHIM_EXPORT AOTITorchError
+aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);
+
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data_ptr,
+    int64_t ndim,
+    const int64_t* sizes,
+    const int64_t* strides,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
     Tensor** ret_new_tensor);
 
 } // extern "C"

From d0b90342373252d8488d614b15f438d5ef41455b Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 21:24:12 -0700
Subject: [PATCH 30/34] undo internal change

---
 backends/aoti/common_shims.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index e8c56a45946..d6e340d99b8 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -7,7 +7,6 @@
  */
 
 #include <executorch/backends/aoti/common_shims.h>
-#include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
 
@@ -15,11 +14,13 @@ namespace executorch {
 namespace backends {
 namespace aoti {
 
+namespace internal {
 // Global storage for tensor metadata
 AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
     tensor_to_sizes;
 AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
     tensor_to_strides;
+} // namespace internal
 
 extern "C" {
 
@@ -51,10 +52,10 @@ AOTITorchError aoti_torch_get_storage_offset(
 }
 
 AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
-  auto it = tensor_to_strides.find(tensor);
+  auto it = internal::tensor_to_strides.find(tensor);
   bool needs_update = false;
 
-  if (it == tensor_to_strides.end()) {
+  if (it == internal::tensor_to_strides.end()) {
     needs_update = true;
   } else {
     // CRITICAL: Multimodal models reuse tensors with different shapes across
@@ -75,7 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
     for (int i = 0; i < tensor->dim(); i++) {
       strides[i] = tensor_strides[i];
     }
-    it = tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first;
+    it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -97,10 +98,10 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
 }
 
 AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
-  auto it = tensor_to_sizes.find(tensor);
+  auto it = internal::tensor_to_sizes.find(tensor);
   bool needs_update = false;
 
-  if (it == tensor_to_sizes.end()) {
+  if (it == internal::tensor_to_sizes.end()) {
     needs_update = true;
   } else {
     // CRITICAL: Multimodal models reuse tensors with different shapes across
@@ -121,7 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
     for (int i = 0; i < tensor->dim(); i++) {
       sizes[i] = tensor_sizes[i];
     }
-    it = tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first;
+    it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -194,8 +195,8 @@ size_t aoti_torch_dtype_element_size(int32_t dtype) {
 
 // Cleanup functions
 void cleanup_tensor_metadata() {
-  tensor_to_sizes.clear();
-  tensor_to_strides.clear();
+  internal::tensor_to_sizes.clear();
+  internal::tensor_to_strides.clear();
 }
 
 void aoti_torch_warn(

From e44d74b95dccd38da3b57a61fee05612d6f055ac Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 21:25:39 -0700
Subject: [PATCH 31/34] we dont have warn level

---
 backends/aoti/common_shims.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index d6e340d99b8..5624f5a4aa0 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -204,7 +204,7 @@ void aoti_torch_warn(
     const char* file,
     uint32_t line,
     const char* msg) {
-  ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg);
+  ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg);
 }
 
 AOTI_SHIM_EXPORT AOTITorchError

From ae6157531660e74016e994b9dd186b58c71a779c Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 22:01:56 -0700
Subject: [PATCH 32/34] hacky mmap change

---
 extension/data_loader/mmap_data_loader.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp
index 10bd2f35f5e..541fc36bfa7 100644
--- a/extension/data_loader/mmap_data_loader.cpp
+++ b/extension/data_loader/mmap_data_loader.cpp
@@ -94,8 +94,13 @@ Result<MmapDataLoader> MmapDataLoader::from(
   }
 
   // Cache the file size.
+#if defined(_WIN32)
+  struct _stat64 st;
+  int err = ::_fstat64(fd, &st);
+#else
   struct stat st;
   int err = ::fstat(fd, &st);
+#endif
   if (err < 0) {
     ET_LOG(
         Error,
@@ -106,7 +111,15 @@ Result<MmapDataLoader> MmapDataLoader::from(
     ::close(fd);
     return Error::AccessFailed;
   }
-  size_t file_size = st.st_size;
+
+  uint64_t file_size_u64 = static_cast<uint64_t>(st.st_size);
+  ET_CHECK_OR_RETURN_ERROR(
+      file_size_u64 <= std::numeric_limits<size_t>::max(),
+      NotSupported,
+      "File %s is too large (%llu bytes) for current platform",
+      file_name,
+      static_cast<unsigned long long>(file_size_u64));
+  size_t file_size = static_cast<size_t>(file_size_u64);
 
   // Copy the filename so we can print better debug messages if reads fail.
   const char* file_name_copy = ::strdup(file_name);

From 1d661c42188c17ad9a700ff6d8a59033e818d6bc Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 22:11:51 -0700
Subject: [PATCH 33/34] more hacky mmap edits

---
 extension/data_loader/mman_windows.cpp     | 55 ++++++++++------------
 extension/data_loader/mman_windows.h       |  9 +++-
 extension/data_loader/mmap_data_loader.cpp | 23 +++++----
 3 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp
index 89f9f22f467..847b2aefadd 100644
--- a/extension/data_loader/mman_windows.cpp
+++ b/extension/data_loader/mman_windows.cpp
@@ -21,6 +21,8 @@
 
 #include <errno.h>
 #include <io.h>
+#include <limits>
+#include <cstdint>
 #include <windows.h>
 
 #ifndef STATUS_SECTION_TOO_BIG
@@ -129,49 +131,42 @@ static DWORD __map_mmap_prot_file(const int prot) {
 
 } // namespace
 
-void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) {
+void* mmap(
+    void* addr,
+    size_t len,
+    int prot,
+    int flags,
+    int fildes,
+    std::uint64_t off) {
   HANDLE fm, h;
-
   void* map = MAP_FAILED;
 
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4293)
-#endif
-
-  const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD))
-      ? (DWORD)off
-      : (DWORD)(off & 0xFFFFFFFFL);
-  const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD))
-      ? (DWORD)0
-      : (DWORD)((off >> 32) & 0xFFFFFFFFL);
-  const DWORD protect = __map_mmap_prot_page(prot);
-  const DWORD desiredAccess = __map_mmap_prot_file(prot);
-
-  const off_t maxSize = off + (off_t)len;
-
-  const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD))
-      ? (DWORD)maxSize
-      : (DWORD)(maxSize & 0xFFFFFFFFL);
-  const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD))
-      ? (DWORD)0
-      : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
   errno = 0;
 
   if (len == 0
       /* Unsupported flag combinations */
       || (flags & MAP_FIXED) != 0
-      /* Usupported protection combinations */
+      /* Unsupported protection combinations */
       || prot == PROT_EXEC) {
     errno = EINVAL;
     return MAP_FAILED;
   }
 
+  if (off > std::numeric_limits<std::uint64_t>::max() - len) {
+    errno = EINVAL;
+    return MAP_FAILED;
+  }
+
+  const std::uint64_t maxSize = off + static_cast<std::uint64_t>(len);
+
+  const DWORD dwFileOffsetLow = static_cast<DWORD>(off & 0xFFFFFFFFULL);
+  const DWORD dwFileOffsetHigh = static_cast<DWORD>((off >> 32) & 0xFFFFFFFFULL);
+  const DWORD protect = __map_mmap_prot_page(prot);
+  const DWORD desiredAccess = __map_mmap_prot_file(prot);
+
+  const DWORD dwMaxSizeLow = static_cast<DWORD>(maxSize & 0xFFFFFFFFULL);
+  const DWORD dwMaxSizeHigh = static_cast<DWORD>((maxSize >> 32) & 0xFFFFFFFFULL);
+
   h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes)
                                      : INVALID_HANDLE_VALUE;
 
diff --git a/extension/data_loader/mman_windows.h b/extension/data_loader/mman_windows.h
index 563db5d8b21..b9e678c121d 100644
--- a/extension/data_loader/mman_windows.h
+++ b/extension/data_loader/mman_windows.h
@@ -31,6 +31,7 @@
 #endif
 
 #include <sys/types.h>
+#include <cstdint>
 
 #ifdef __cplusplus
 extern "C" {
@@ -56,7 +57,13 @@ extern "C" {
 #define MS_SYNC 2
 #define MS_INVALIDATE 4
 
-void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off);
+void* mmap(
+    void* addr,
+    size_t len,
+    int prot,
+    int flags,
+    int fildes,
+    std::uint64_t off);
 int munmap(void* addr, size_t len);
 int mprotect(void* addr, size_t len, int prot);
 int msync(void* addr, size_t len, int flags);
diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp
index 541fc36bfa7..433de270703 100644
--- a/extension/data_loader/mmap_data_loader.cpp
+++ b/extension/data_loader/mmap_data_loader.cpp
@@ -11,6 +11,7 @@
 #include <cerrno>
 #include <cstring>
 #include <limits>
+#include <cstdint>
 
 #include <fcntl.h>
 #include <sys/stat.h>
@@ -180,12 +181,6 @@ Error MmapDataLoader::validate_input(size_t offset, size_t size) const {
       offset,
       size,
       file_size_);
-  ET_CHECK_OR_RETURN_ERROR(
-      // Recommended by a lint warning.
-      offset <= std::numeric_limits<off_t>::max(),
-      InvalidArgument,
-      "Offset %zu too large for off_t",
-      offset);
   return Error::Ok;
 }
 
@@ -220,13 +215,19 @@ Result<FreeableBuffer> MmapDataLoader::load(
 
   // Map the pages read-only. Use shared mappings so that other processes
   // can also map the same pages and share the same memory.
+#if defined(_WIN32)
+  const std::uint64_t map_offset = static_cast<std::uint64_t>(range.start);
+#else
+  const off_t map_offset = static_cast<off_t>(range.start);
+#endif
+
   void* pages = ::mmap(
       nullptr,
       map_size,
       PROT_READ,
       MAP_SHARED,
       fd_,
-      static_cast<off_t>(range.start));
+      map_offset);
   ET_CHECK_OR_RETURN_ERROR(
       pages != MAP_FAILED,
       AccessFailed,
@@ -328,13 +329,19 @@ Error MmapDataLoader::load_into(
   // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since
   // the data is read-only, but use PRIVATE just to further avoid accidentally
   // modifying the file.
+#if defined(_WIN32)
+  const std::uint64_t map_offset = static_cast<std::uint64_t>(range.start);
+#else
+  const off_t map_offset = static_cast<off_t>(range.start);
+#endif
+
   void* pages = ::mmap(
       nullptr,
       map_size,
       PROT_READ,
       MAP_PRIVATE,
       fd_,
-      static_cast<off_t>(range.start));
+      map_offset);
   ET_CHECK_OR_RETURN_ERROR(
       pages != MAP_FAILED,
       AccessFailed,

From 0ab474659090397b99d745d604c6bf13ea55ceef Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Sat, 18 Oct 2025 22:15:32 -0700
Subject: [PATCH 34/34] minmax issue

---
 extension/data_loader/mman_windows.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp
index 847b2aefadd..dec991376cf 100644
--- a/extension/data_loader/mman_windows.cpp
+++ b/extension/data_loader/mman_windows.cpp
@@ -23,7 +23,9 @@
 #include <io.h>
 #include <limits>
 #include <cstdint>
+#define NOMINMAX
 #include <windows.h>
+#undef NOMINMAX
 
 #ifndef STATUS_SECTION_TOO_BIG
 #define STATUS_SECTION_TOO_BIG 0xC0000040L