diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b96c12fbf3..e04760f659d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,11 +99,13 @@ announce_configured_options(CCACHE_PROGRAM) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# Force logging to always be enabled for this build. if(NOT EXECUTORCH_ENABLE_LOGGING) - # Avoid pulling in the logging strings, which can be large. Note that this - # will set the compiler flag for all targets in this directory, and for all - # subdirectories included after this point. - add_definitions(-DET_LOG_ENABLED=0) + message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.") + set(EXECUTORCH_ENABLE_LOGGING + ON + CACHE BOOL "Build with ET_LOG_ENABLED" FORCE + ) endif() add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL}) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index ac87d49d5a5..5624f5a4aa0 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { @@ -74,9 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { for (int i = 0; i < tensor->dim(); i++) { strides[i] = tensor_strides[i]; } - it = - internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) - .first; + it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -122,8 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { for (int i = 0; i < tensor->dim(); i++) { sizes[i] = tensor_sizes[i]; } - it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) - .first; + it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -200,6 +199,69 @@ void cleanup_tensor_metadata() { internal::tensor_to_strides.clear(); } +void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + 
(void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 1b0429e3aba..40849a9d5af 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -15,6 +15,13 @@ #include #include +#if defined(BUILDING_CUDA_BACKEND) +#include +#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT +#else +#define AOTI_SHIM_EXPORT +#endif + namespace executorch { namespace backends { namespace aoti { @@ -23,56 +30,89 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; - // Attribute-related operations (memory-irrelevant) -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t 
aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); + +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor); } // extern "C" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 8b94351d469..3d0aa450b44 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -43,8 +43,28 @@ set(_aoti_cuda_sources runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu runtime/platform/platform.cpp + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC +if(MSVC) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + # Define export macros for Windows DLL + target_compile_definitions( + aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND + ) + # Ensure proper DLL import/export library naming on Windows with + # config-specific paths + set_target_properties( + aoti_cuda + PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via + # AOTI_CUDA_EXPORT + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ) +else() + add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +endif() target_include_directories( aoti_cuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} @@ -64,11 +84,15 @@ target_link_options( # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries target_link_libraries( - aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS} ) # If you need other CUDA libraries, link them similarly: # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) + +# Only apply shared lib options on non-Windows platforms +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() if(BUILD_TESTING) # Add runtime @@ -82,5 +106,7 @@ endif() install( TARGETS aoti_cuda EXPORT ExecuTorchTargets - DESTINATION lib + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin ) diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h new file mode 100644 index 00000000000..c15ac7ee8aa --- /dev/null +++ b/backends/cuda/runtime/CudaBackend.h @@ -0,0 +1,46 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#pragma once + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + public: + /** + * Check if the CUDA backend is available. + */ + bool is_available() const override; + + /** + * Initialize the backend with the given context and compile specs. + * Called once per loaded binary blob. + */ + ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init( + ::executorch::runtime::BackendInitContext& context, + ::executorch::runtime::FreeableBuffer* processed, + ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec> + compile_specs) const override; + + /** + * Execute the backend with the given context and arguments. + * Called once per execution. + */ + ::executorch::runtime::Error execute( + ::executorch::runtime::BackendExecutionContext& context, + ::executorch::runtime::DelegateHandle* handle, + ::executorch::runtime::Span<::executorch::runtime::EValue*> args) + const override; + + /** + * Destroy the backend handle and clean up resources. + */ + void destroy(::executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 0cef859ddfb..029f7ad16bb 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -18,9 +19,10 @@ #include #include -// Include our shim layer headers +// Include class header and shim layer headers #include #include +#include #include #include #include @@ -46,12 +48,9 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class ET_EXPERIMENTAL CudaBackend final - : public ::executorch::runtime::BackendInterface { - private: - Error load_function_pointers_into_handle( - void* so_handle, - AOTIDelegateHandle* handle) const { +static Error load_function_pointers_into_handle( + void* so_handle, + AOTIDelegateHandle* handle) { #define LOAD_SYMBOL(member, name) \ do { \ auto symbol_res = get_function(so_handle, #name); \ @@ -61,319 +60,321 @@ class ET_EXPERIMENTAL CudaBackend final handle->member = reinterpret_cast(symbol_res.get()); \ } while (0) - LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); + LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); - LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); + LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); - LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); + LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); - LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); + LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); - LOAD_SYMBOL(run, AOTInductorModelContainerRun); + LOAD_SYMBOL(run, AOTInductorModelContainerRun); #undef LOAD_SYMBOL - auto symbol_res = - get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); - if (symbol_res.ok()) { - handle->update_constants_from_blob = - reinterpret_cast( - symbol_res.get()); - } else { - ET_LOG( - Info, - "Failed to load AOTInductorModelUpdateConstantsFromBlob. 
This .so is probably compiled on an old version of torch (<2.9.0)"); + auto symbol_res = + get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); + if (symbol_res.ok()) { + handle->update_constants_from_blob = + reinterpret_cast( + symbol_res.get()); + } else { + ET_LOG( + Info, + "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)"); + } + return Error::Ok; +} + +bool CudaBackend::is_available() const { + return 1; +} + +Result CudaBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if (std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; } - return Error::Ok; } - public: - bool is_available() const override { - return 1; + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_dso_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_dso_buffer.error())); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path, ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_dso_buffer->size(), + so_path.c_str()); + + outfile.write( + static_cast(aoti_dso_buffer->data()), + aoti_dso_buffer->size()); + + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + + // Finish writing the file to disk + outfile.close(); + + // Free the buffer immediately after writing to disk + aoti_dso_buffer->Free(); + // Load the lib + Result lib_handle_res = load_library(so_path); + if (!lib_handle_res.ok()) { + return lib_handle_res.error(); } + void* lib_handle = lib_handle_res.get(); - // Once per loaded binary blob - Result init( - BackendInitContext& context, - FreeableBuffer* processed, // This will be a empty buffer - ArrayRef compile_specs // This will be my empty list - ) const override { - std::string method_name; - for (const CompileSpec& spec : compile_specs) { - if (std::strcmp(spec.key, "method_name") == 0) { - method_name.assign( - static_cast(spec.value.buffer), - spec.value.nbytes); // no nullptr guarantee, so pass size - break; - } - } - - std::string so_blob_key = - method_name.empty() ? 
"so_blob" : method_name + "_so_blob"; - - const NamedDataMap* named_data_map = context.get_named_data_map(); - auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); - ET_CHECK_OR_RETURN_ERROR( - aoti_dso_buffer.ok(), - Internal, - "Failed to get data for key %s: 0x%x", - so_blob_key.c_str(), - static_cast(aoti_dso_buffer.error())); + processed->Free(); - // Generate dynamic temporary file path - filesystem::path temp_dir = filesystem::temp_directory_path(); - filesystem::path so_path = - temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); + // Create handle and load function pointers into it + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = lib_handle; + handle->so_path = so_path.string(); - // Create a temporary file - ofstream outfile(so_path, ios::binary); + // Load function pointers specific to this handle's shared library + ET_CHECK_OK_OR_RETURN_ERROR( + load_function_pointers_into_handle(lib_handle, handle)); - // Write the ELF buffer to the temporary file - ET_LOG( - Info, - "Writing %zu bytes to %s", - aoti_dso_buffer->size(), - so_path.c_str()); + AOTInductorModelContainerHandle container_handle = nullptr; - outfile.write( - static_cast(aoti_dso_buffer->data()), - aoti_dso_buffer->size()); + ET_CHECK_OK_OR_RETURN_ERROR( + handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - ET_CHECK_OR_RETURN_ERROR( - outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + ET_LOG(Info, "container_handle = %p", container_handle); - // Finish writing the file to disk - outfile.close(); + handle->container_handle = container_handle; - // Free the buffer immediately after writing to disk - aoti_dso_buffer->Free(); - // Load the lib - Result lib_handle_res = load_library(so_path); - if (!lib_handle_res.ok()) { - return lib_handle_res.error(); - } - void* lib_handle = lib_handle_res.get(); - - processed->Free(); - - // Create handle and load function pointers into it - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = lib_handle; - handle->so_path = so_path.string(); - - // Load function pointers specific to this handle's shared library - ET_CHECK_OK_OR_RETURN_ERROR( - load_function_pointers_into_handle(lib_handle, handle)); - - AOTInductorModelContainerHandle container_handle = nullptr; - - ET_CHECK_OK_OR_RETURN_ERROR( - handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - - ET_LOG(Info, "container_handle = %p", container_handle); - - handle->container_handle = container_handle; - - // Look into named data map for constant data - std::string weights_blob_key = - method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; - auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); - if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { - ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); - const void* weights_blob = buffer_res->data(); - // Feed the weights blob into the container. Under the hood it's copying - // weights, so we should free the buffer immediately. 
- ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( - handle->container_handle, static_cast(weights_blob))); - buffer_res->Free(); - } - // Create a CUDA stream for asynchronous execution - cudaStream_t cuda_stream; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); - handle->cuda_stream = static_cast(cuda_stream); - - return (DelegateHandle*)handle; // Return the handle post-processing + // Look into named data map for constant data + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); + const void* weights_blob = buffer_res->data(); + // Feed the weights blob into the container. Under the hood it's copying + // weights, so we should free the buffer immediately. + ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( + handle->container_handle, static_cast(weights_blob))); + buffer_res->Free(); } + // Create a CUDA stream for asynchronous execution + cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + + return (DelegateHandle*)handle; // Return the handle post-processing +} + +Error CudaBackend::execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + size_t n_inputs; + handle->get_num_inputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + handle->get_num_outputs(handle->container_handle, &n_outputs); + + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. 
Exit.", + n_inputs, + n_outputs, + args.size()) + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); - // Once per execution - Error execute( - BackendExecutionContext& context, - DelegateHandle* handle_, - Span args) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - size_t n_inputs; - handle->get_num_inputs(handle->container_handle, &n_inputs); + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for input %d", + i); - size_t n_outputs; - handle->get_num_outputs(handle->container_handle, &n_outputs); + gpu_inputs[i] = gpu_input_handle; + // Copy data from CPU to GPU ET_CHECK_OR_RETURN_ERROR( - n_inputs + n_outputs == args.size(), - InvalidArgument, - "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", - n_inputs, - n_outputs, - args.size()) - - // NOTE: ExecuTorch tensors are always on CPU/host memory - // We need to create GPU copies for CUDA kernel execution - std::vector gpu_inputs( - n_inputs); // GPU copies for kernel execution - std::vector gpu_outputs( - n_outputs); // GPU tensors for kernel output - - // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies - for (int i = 0; i < n_inputs; i++) { - // Get tensor dimensions and properties from ExecuTorch CPU tensor - auto cpu_tensor = &(args[i]->toTensor()); - auto sizes = cpu_tensor->sizes(); - auto scalar_type = cpu_tensor->scalar_type(); - - // Create GPU tensor with same shape - std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_input_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_input_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for input %d", - i); - - gpu_inputs[i] = gpu_input_handle; - - // Copy data from CPU to GPU - ET_CHECK_OR_RETURN_ERROR( - aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, - Internal, - "Failed to copy input %d from CPU to GPU", - i); - } - // Process output tensors: create GPU counterparts for ExecuTorch CPU - // tensors - for (int i = 0; i < n_outputs; i++) { - // Get output tensor dimensions from ExecuTorch CPU tensor - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - auto sizes = cpu_output_tensor->sizes(); - auto scalar_type = cpu_output_tensor->scalar_type(); - - // Create GPU tensor with same shape for kernel output - std::vector sizes_vec(sizes.begin(), sizes.end()); - - 
AOTITensorHandle gpu_output_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_output_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for output %d", - i); - - gpu_outputs[i] = gpu_output_handle; - } - // Run AOTI container with GPU tensors - AOTIRuntimeError error = handle->run( - handle->container_handle, - gpu_inputs.data(), // Use GPU input tensors - n_inputs, - gpu_outputs.data(), // Use GPU output tensors - n_outputs, - handle->cuda_stream, // Pass the actual CUDA stream - nullptr); // proxy_executor_handle can remain nullptr + aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, + Internal, + "Failed to copy input %d from CPU to GPU", + i); + } + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); ET_CHECK_OR_RETURN_ERROR( - error == Error::Ok, + create_err == Error::Ok, Internal, - "AOTInductorModelContainerRun failed with error code %d", - error); - - // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - // For DYNAMIC_BOUND tensors we try to resize - ET_CHECK_OK_OR_RETURN_ERROR( - resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", - i); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", - i); - } + "Failed to create GPU tensor for output %d", + i); - return Error::Ok; + gpu_outputs[i] = gpu_output_handle; + } + // Run AOTI container with GPU tensors + AOTIRuntimeError error = handle->run( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + handle->cuda_stream, // Pass the actual CUDA stream + nullptr); // proxy_executor_handle can remain nullptr + + ET_CHECK_OR_RETURN_ERROR( + error == Error::Ok, + Internal, + "AOTInductorModelContainerRun failed with error code %d", + error); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); } - void destroy(DelegateHandle* handle_) const override { - if (handle_ == nullptr) { - return; - } - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - // Destroy the CUDA stream if it 
exists - if (handle->cuda_stream != nullptr) { - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); - cudaError_t stream_err = cudaStreamDestroy(cuda_stream); - ET_CHECK_OR_LOG_ERROR( - stream_err == cudaSuccess, - "Failed to destroy CUDA stream: %s", - cudaGetErrorString(stream_err)); - handle->cuda_stream = nullptr; - } + return Error::Ok; +} - // NOTE: AOTInductorModelContainerDelete does not work correctly with - // multiple .so files. Deleting one container frees shared resources, - // which causes segmentation faults when attempting to delete other - // containers. As a workaround, we skip explicit container deletion - // and defer cleanup to the OS. - // TODO(gasoonjia): Find a proper solution for safe container deletion. - // AOTInductorModelContainerDelete(handle->container_handle); - - // Now close the shared library - auto err = Error::Ok; - if (handle->so_handle != nullptr) { - err = close_library(handle->so_handle); - } +void CudaBackend::destroy(DelegateHandle* handle_) const { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Destroy the CUDA stream if it exists + if (handle->cuda_stream != nullptr) { + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t stream_err = cudaStreamDestroy(cuda_stream); + ET_CHECK_OR_LOG_ERROR( + stream_err == cudaSuccess, + "Failed to destroy CUDA stream: %s", + cudaGetErrorString(stream_err)); + handle->cuda_stream = nullptr; + } - // Remove the temporary shared library file - if (!handle->so_path.empty()) { - std::error_code remove_error; - std::filesystem::remove(handle->so_path, remove_error); - ET_CHECK_OR_LOG_ERROR( - !remove_error, - "Failed to remove temporary shared library %s: %s", - handle->so_path.c_str(), - remove_error.message().c_str()); - } + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. 
+ // AOTInductorModelContainerDelete(handle->container_handle); + + // Now close the shared library + auto err = Error::Ok; + if (handle->so_handle != nullptr) { + err = close_library(handle->so_handle); + } - delete handle; - clear_all_tensors(); + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); } -}; + + delete handle; + clear_all_tensors(); +} } // namespace executorch::backends::cuda namespace executorch::backends { -namespace { + +// Backend instance - static on all platforms auto cls = cuda::CudaBackend(); -executorch::runtime::Backend backend{"CudaBackend", &cls}; +executorch::runtime::Backend cuda_backend{"CudaBackend", &cls}; + +#ifndef _WIN32 +// On non-Windows platforms, use automatic static initialization +namespace { static executorch::runtime::Error success_with_compiler = - register_backend(backend); + register_backend(cuda_backend); } // namespace +#endif + } // namespace executorch::backends diff --git a/backends/cuda/runtime/cuda_backend_init.h b/backends/cuda/runtime/cuda_backend_init.h new file mode 100644 index 00000000000..a05dc082857 --- /dev/null +++ b/backends/cuda/runtime/cuda_backend_init.h @@ -0,0 +1,35 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) +#define CUDA_BACKEND_INIT_IMPORT __declspec(dllimport) +#else +#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default"))) +#define CUDA_BACKEND_INIT_IMPORT +#endif + +// When building the DLL, define BUILDING_CUDA_BACKEND +// When using the DLL, this will import the function +#ifdef BUILDING_CUDA_BACKEND +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_EXPORT +#else +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_IMPORT +#endif + +/** + * Initialize the CUDA backend and register it with the ExecutorTorch runtime. + * On Windows, this must be called explicitly before loading models that use + * the CUDA backend. On other platforms, the backend is registered automatically + * via static initialization. + */ +CUDA_BACKEND_INIT_API void InitCudaBackend(); + +#ifdef __cplusplus +} +#endif diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h new file mode 100644 index 00000000000..65fe7192611 --- /dev/null +++ b/backends/cuda/runtime/export.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). On non-Windows platforms, the macro is empty and has +// no effect. 
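Note: the new cuda_backend_init.h above declares InitCudaBackend() but this patch does not show its definition. A minimal sketch of what such a definition could look like, reusing the registration pattern from cuda_backend.cpp above — include paths are assumed from the repository layout; this is illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/CudaBackend.h>
#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
#include <executorch/runtime/backend/interface.h>

extern "C" void InitCudaBackend() {
  // Static locals keep the backend instance alive for the process lifetime
  // and make sure registration runs at most once.
  static auto backend_impl = ::executorch::backends::cuda::CudaBackend();
  static auto backend =
      ::executorch::runtime::Backend{"CudaBackend", &backend_impl};
  static ::executorch::runtime::Error status =
      ::executorch::runtime::register_backend(backend);
  (void)status;
}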
+#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_CUDA_EXPORT __declspec(dllexport) +#else +#define AOTI_CUDA_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_CUDA_EXPORT +#endif diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..ec2b381f524 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -33,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -44,7 +44,8 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -54,9 +55,8 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -69,7 +69,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +81,8 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
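For reference, a caller-side sketch of the guard API declared in this header. The include path, enclosing namespace, and the restore-on-delete semantics are assumed from the declarations and doc comments above; illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/shims/cuda_guard.h>

using executorch::runtime::Error;
namespace cuda = executorch::backends::cuda;

Error run_with_device_guard() {
  // Switch the current CUDA device to index 0 for the lifetime of the guard.
  cuda::CUDAGuardHandle guard = nullptr;
  Error err = cuda::aoti_torch_create_cuda_guard(0, &guard);
  if (err != Error::Ok) {
    return err;
  }
  // Ask which stream work on device 0 would currently be enqueued on.
  void* stream = nullptr;
  err = cuda::aoti_torch_get_current_cuda_stream(0, &stream);
  if (err != Error::Ok) {
    (void)cuda::aoti_torch_delete_cuda_guard(guard);
    return err;
  }
  // ... launch kernels on `stream` ...
  // Delete the guard, restoring the previously active device.
  return cuda::aoti_torch_delete_cuda_guard(guard);
}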
@@ -91,9 +92,8 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..b84a7a4d149 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..924f268d3ca 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,11 +137,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_CUDA_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 15a4e397d24..3c881df63b3 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -24,12 +25,11 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes 
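A caller-side sketch of the device-type helpers declared just below. The include path and namespace are assumed from the surrounding headers; illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>

#include <cstdint>

using executorch::runtime::Error;
namespace cuda = executorch::backends::cuda;

// Reports whether `tensor` carries the CUDA device type.
Error tensor_is_on_cuda(
    executorch::runtime::etensor::Tensor* tensor,
    bool* out_is_cuda) {
  int32_t device_type = 0;
  Error err = cuda::aoti_torch_get_device_type(tensor, &device_type);
  if (err != Error::Ok) {
    return err;
  }
  *out_is_cuda = (device_type == cuda::aoti_torch_device_type_cuda());
  return Error::Ok;
}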
-AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t aoti_torch_device_type_cuda(); +AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 866d17160ba..cfb2acd3759 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) @@ -86,13 +84,6 @@ list( extension_flat_tensor ) -# Link CUDA backend -if(EXECUTORCH_BUILD_CUDA) - find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) -endif() - if(EXECUTORCH_BUILD_METAL) list(APPEND link_libraries metal_backend) executorch_target_link_options_shared_lib(metal_backend) @@ -104,11 +95,24 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + if(MSVC) + # On MSVC, link directly without shared lib options to use import library + list(APPEND link_libraries aoti_cuda) + else() + # On non-MSVC, use shared lib options + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) + endif() +endif() + target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 29edf955751..ff483cce516 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -25,6 +25,10 @@ #include #include +// Manually register the CUDA backend +#include +#include + #if defined(ET_USE_THREADPOOL) #include #include @@ -283,6 +287,22 @@ MultimodalInput processAudioFile( } // namespace int32_t main(int32_t argc, char** argv) { + // Manually register the CUDA backend (required on Windows, harmless on other + // platforms) + ET_LOG(Info, "Registering CUDA backend"); + static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend(); + static auto cuda_backend = + ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; + auto register_error = 
::executorch::runtime::register_backend(cuda_backend); + if (register_error == ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Successfully registered CudaBackend"); + } else { + ET_LOG( + Error, + "Failed to register CudaBackend: error code %d", + (int)register_error); + } + gflags::ParseCommandLineFlags(&argc, &argv, true); const char* model_path = FLAGS_model_path.c_str(); diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp index 89f9f22f467..dec991376cf 100644 --- a/extension/data_loader/mman_windows.cpp +++ b/extension/data_loader/mman_windows.cpp @@ -21,7 +21,11 @@ #include #include +#include +#include +#define NOMINMAX #include +#undef NOMINMAX #ifndef STATUS_SECTION_TOO_BIG #define STATUS_SECTION_TOO_BIG 0xC0000040L @@ -129,49 +133,42 @@ static DWORD __map_mmap_prot_file(const int prot) { } // namespace -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) { +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off) { HANDLE fm, h; - void* map = MAP_FAILED; -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4293) -#endif - - const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)off - : (DWORD)(off & 0xFFFFFFFFL); - const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((off >> 32) & 0xFFFFFFFFL); - const DWORD protect = __map_mmap_prot_page(prot); - const DWORD desiredAccess = __map_mmap_prot_file(prot); - - const off_t maxSize = off + (off_t)len; - - const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)maxSize - : (DWORD)(maxSize & 0xFFFFFFFFL); - const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - errno = 0; if (len == 0 /* Unsupported flag combinations */ || (flags & MAP_FIXED) != 0 - /* Usupported protection combinations */ + /* Unsupported protection combinations */ || prot == PROT_EXEC) { errno = EINVAL; return MAP_FAILED; } + if (off > std::numeric_limits::max() - len) { + errno = EINVAL; + return MAP_FAILED; + } + + const std::uint64_t maxSize = off + static_cast(len); + + const DWORD dwFileOffsetLow = static_cast(off & 0xFFFFFFFFULL); + const DWORD dwFileOffsetHigh = static_cast((off >> 32) & 0xFFFFFFFFULL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFULL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFULL); + h = ((flags & MAP_ANONYMOUS) == 0) ? 
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; diff --git a/extension/data_loader/mman_windows.h b/extension/data_loader/mman_windows.h index 563db5d8b21..b9e678c121d 100644 --- a/extension/data_loader/mman_windows.h +++ b/extension/data_loader/mman_windows.h @@ -31,6 +31,7 @@ #endif #include +#include #ifdef __cplusplus extern "C" { @@ -56,7 +57,13 @@ extern "C" { #define MS_SYNC 2 #define MS_INVALIDATE 4 -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off); +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off); int munmap(void* addr, size_t len); int mprotect(void* addr, size_t len, int prot); int msync(void* addr, size_t len, int flags); diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index 10bd2f35f5e..433de270703 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -94,8 +95,13 @@ Result MmapDataLoader::from( } // Cache the file size. +#if defined(_WIN32) + struct _stat64 st; + int err = ::_fstat64(fd, &st); +#else struct stat st; int err = ::fstat(fd, &st); +#endif if (err < 0) { ET_LOG( Error, @@ -106,7 +112,15 @@ Result MmapDataLoader::from( ::close(fd); return Error::AccessFailed; } - size_t file_size = st.st_size; + + uint64_t file_size_u64 = static_cast(st.st_size); + ET_CHECK_OR_RETURN_ERROR( + file_size_u64 <= std::numeric_limits::max(), + NotSupported, + "File %s is too large (%llu bytes) for current platform", + file_name, + static_cast(file_size_u64)); + size_t file_size = static_cast(file_size_u64); // Copy the filename so we can print better debug messages if reads fail. const char* file_name_copy = ::strdup(file_name); @@ -167,12 +181,6 @@ Error MmapDataLoader::validate_input(size_t offset, size_t size) const { offset, size, file_size_); - ET_CHECK_OR_RETURN_ERROR( - // Recommended by a lint warning. - offset <= std::numeric_limits::max(), - InvalidArgument, - "Offset %zu too large for off_t", - offset); return Error::Ok; } @@ -207,13 +215,19 @@ Result MmapDataLoader::load( // Map the pages read-only. Use shared mappings so that other processes // can also map the same pages and share the same memory. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_SHARED, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed, @@ -315,13 +329,19 @@ Error MmapDataLoader::load_into( // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since // the data is read-only, but use PRIVATE just to further avoid accidentally // modifying the file. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_PRIVATE, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed,