From 118cb0e66714f9059a3a82e9492cddfbbc7a6c8a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 10:53:48 -0700 Subject: [PATCH 01/34] dont build custom or quantized ops on msvc cuda --- examples/models/voxtral/CMakeLists.txt | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 866d17160ba..76569740354 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) From cd93bdae20d0817431deafb0cc8a917ce320cb7a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 12:54:23 -0700 Subject: [PATCH 02/34] dll attempt 1 --- backends/cuda/CMakeLists.txt | 9 ++++++++- examples/models/voxtral/CMakeLists.txt | 10 ++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 8b94351d469..c264540266f 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -44,7 +44,14 @@ set(_aoti_cuda_sources runtime/shims/int4mm.cu runtime/platform/platform.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC +if(MSVC) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + # Define export macro for Windows DLL + target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS) +else() + add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +endif() target_include_directories( aoti_cuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 76569740354..4e4367975fe 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -88,7 +88,17 @@ list( if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) list(APPEND link_libraries aoti_cuda) + # Handle both static library and shared library (.dll on Windows MSVC) executorch_target_link_options_shared_lib(aoti_cuda) + # On MSVC, ensure the DLL is copied to the output directory + if(MSVC) + add_custom_command(TARGET voxtral_runner POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + COMMENT "Copying aoti_cuda.dll to output directory" + ) + endif() endif() if(EXECUTORCH_BUILD_METAL) From 0af3f62c59b4dae5d2a18e2e6d4be43fea04c224 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 13:19:15 -0700 Subject: [PATCH 03/34] add export macro to header definitions --- backends/cuda/runtime/export.h | 24 +++++++++++++++++++ backends/cuda/runtime/shims/cuda_guard.h | 13 +++++----- backends/cuda/runtime/shims/int4mm.h | 3 ++- backends/cuda/runtime/shims/memory.h | 13 +++++----- .../cuda/runtime/shims/tensor_attribute.h | 5 ++-- 5 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 backends/cuda/runtime/export.h diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h new file mode 100644 index 00000000000..23623888e85 --- /dev/null +++ b/backends/cuda/runtime/export.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by CMake, +// which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using __declspec(dllimport). +// On non-Windows platforms, the macro is empty and has no effect. +#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_CUDA_EXPORT __declspec(dllexport) +#else +#define AOTI_CUDA_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_CUDA_EXPORT +#endif diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..932ce6ab785 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace executorch::backends::cuda { @@ -33,7 +34,7 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard( int32_t device_index, CUDAGuardHandle* ret_guard); @@ -44,7 +45,7 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -54,7 +55,7 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( CUDAGuardHandle guard, int32_t device_index); @@ -69,7 +70,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +82,7 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. @@ -91,7 +92,7 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( int32_t device_index, void** ret_stream); diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..b84a7a4d149 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..924f268d3ca 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,11 +137,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_CUDA_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 15a4e397d24..8b4950a154d 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -24,12 +25,12 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTITorchError aoti_torch_get_device_type( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_device_type( Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t aoti_torch_device_type_cuda(); +AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" From 33682489c9669e4541b9cb3f759b1bc651926d66 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 13:23:55 -0700 Subject: [PATCH 04/34] voxtral cmake --- examples/models/voxtral/CMakeLists.txt | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 4e4367975fe..e2684b05a5c 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -90,15 +90,6 @@ if(EXECUTORCH_BUILD_CUDA) list(APPEND link_libraries aoti_cuda) # Handle both static library and shared library (.dll on Windows MSVC) executorch_target_link_options_shared_lib(aoti_cuda) - # On MSVC, ensure the DLL is copied to the output directory - if(MSVC) - add_custom_command(TARGET voxtral_runner POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - $ - $ - COMMENT "Copying aoti_cuda.dll to output directory" - ) - endif() endif() if(EXECUTORCH_BUILD_METAL) @@ -110,6 +101,16 @@ endif() list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) + +# On MSVC, ensure the aoti_cuda DLL is copied to the output directory +if(EXECUTORCH_BUILD_CUDA AND MSVC) + add_custom_command(TARGET voxtral_runner POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + COMMENT "Copying aoti_cuda.dll to output directory" + ) +endif() if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) if(NOT APPLE) From 450b6fb15653501efaadf49de80fbdbd06d35199 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 13:43:00 -0700 Subject: [PATCH 05/34] cmake dll install changes --- backends/cuda/CMakeLists.txt | 11 ++++++++++- examples/models/voxtral/CMakeLists.txt | 10 ---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c264540266f..955991f9f18 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -49,6 +49,13 @@ if(MSVC) add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) # Define export macro for Windows DLL target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS) + # Ensure proper DLL import/export library naming on Windows + set_target_properties(aoti_cuda PROPERTIES + WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) else() add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) endif() @@ -89,5 +96,7 @@ endif() install( TARGETS aoti_cuda EXPORT ExecuTorchTargets - DESTINATION lib + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin ) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index e2684b05a5c..c1e52c9095c 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -101,16 +101,6 @@ endif() list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) - -# On MSVC, ensure the aoti_cuda DLL is copied to the output directory -if(EXECUTORCH_BUILD_CUDA AND MSVC) - add_custom_command(TARGET voxtral_runner POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - $ - $ - COMMENT "Copying aoti_cuda.dll to output directory" - ) -endif() if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) if(NOT APPLE) From 0bbcd7194eef65fcd5dd0b48404ec253fd7722cc Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 14:13:36 -0700 Subject: [PATCH 06/34] more cmake stuff to try --- examples/models/voxtral/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index c1e52c9095c..978d78a8ba3 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -88,8 +88,11 @@ list( if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) list(APPEND link_libraries aoti_cuda) - # Handle both static library and shared library (.dll on Windows MSVC) - executorch_target_link_options_shared_lib(aoti_cuda) + # CMake will automatically use the import library on Windows + if(NOT MSVC) + # Only apply shared lib linking options on non-Windows platforms + executorch_target_link_options_shared_lib(aoti_cuda) + endif() endif() if(EXECUTORCH_BUILD_METAL) @@ -103,7 +106,7 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() From efcdfa1c0914623c1eb99e726a563dace02fb7d1 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 15:01:49 -0700 Subject: [PATCH 07/34] hacky work to get backend registration --- backends/cuda/runtime/cuda_backend.cpp | 13 +++++++++++++ examples/models/voxtral/CMakeLists.txt | 22 +++++++++++----------- examples/models/voxtral/multimodal.cpp | 9 +++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 0cef859ddfb..a72d866e261 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -376,4 +376,17 @@ executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); } // namespace + +// Export an initialization function to ensure static initializers run on Windows +#ifdef _WIN32 +#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) +#else +#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default"))) +#endif + +extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { + // Force the static initializer to run by referencing it + (void)success_with_compiler; +} + } // namespace executorch::backends diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 978d78a8ba3..3639d938db9 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -84,17 +84,6 @@ list( extension_flat_tensor ) -# Link CUDA backend -if(EXECUTORCH_BUILD_CUDA) - find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - # CMake will automatically use the import library on Windows - if(NOT MSVC) - # Only apply shared lib linking options on non-Windows platforms - executorch_target_link_options_shared_lib(aoti_cuda) - endif() -endif() - if(EXECUTORCH_BUILD_METAL) list(APPEND link_libraries metal_backend) executorch_target_link_options_shared_lib(metal_backend) @@ -111,6 +100,17 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") endif() endif() +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA AND NOT MSVC) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) +elseif(EXECUTORCH_BUILD_CUDA AND MSVC) + find_package(CUDAToolkit REQUIRED) + target_link_libraries(voxtral_runner PRIVATE aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() + target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 29edf955751..0bfdb60dfca 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -282,7 +282,16 @@ MultimodalInput processAudioFile( } // namespace +// Forward declare the initialization function from aoti_cuda +extern "C" void InitCudaBackend(); + int32_t main(int32_t argc, char** argv) { +#ifdef _WIN32 + // On Windows, explicitly initialize the CUDA backend to ensure + // static initializers in the DLL run + InitCudaBackend(); +#endif + gflags::ParseCommandLineFlags(&argc, &argv, true); const char* model_path = FLAGS_model_path.c_str(); From 0b000cd216bd5a51e4d8efc6c064e8b097094987 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:02:45 -0700 Subject: [PATCH 08/34] more cmake checks --- backends/cuda/CMakeLists.txt | 6 +++++- examples/models/voxtral/CMakeLists.txt | 16 +++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 955991f9f18..f55d94ec165 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -82,7 +82,11 @@ target_link_libraries( ) # If you need other CUDA libraries, link them similarly: # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) + +# Only apply shared lib options on non-Windows platforms +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() if(BUILD_TESTING) # Add runtime diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 3639d938db9..cfb2acd3759 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -101,14 +101,16 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") endif() # Link CUDA backend -if(EXECUTORCH_BUILD_CUDA AND NOT MSVC) +if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) -elseif(EXECUTORCH_BUILD_CUDA AND MSVC) - find_package(CUDAToolkit REQUIRED) - target_link_libraries(voxtral_runner PRIVATE aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) + if(MSVC) + # On MSVC, link directly without shared lib options to use import library + list(APPEND link_libraries aoti_cuda) + else() + # On non-MSVC, use shared lib options + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) + endif() endif() target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) From 6fca29a27341882d27d8ccb686ddb038954b15b0 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:08:17 -0700 Subject: [PATCH 09/34] static init hacks --- backends/cuda/runtime/cuda_backend.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index a72d866e261..1e6325fe277 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -371,13 +371,19 @@ class ET_EXPERIMENTAL CudaBackend final namespace executorch::backends { namespace { +// Static backend instance and registration auto cls = cuda::CudaBackend(); executorch::runtime::Backend backend{"CudaBackend", &cls}; + +#ifndef _WIN32 +// On non-Windows platforms, use static initialization static executorch::runtime::Error success_with_compiler = register_backend(backend); +#endif + } // namespace -// Export an initialization function to ensure static initializers run on Windows +// Export an initialization function for explicit backend registration #ifdef _WIN32 #define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) #else @@ -385,8 +391,18 @@ static executorch::runtime::Error success_with_compiler = #endif extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { - // Force the static initializer to run by referencing it +#ifdef _WIN32 + // On Windows, explicitly register the backend since DLL static initializers + // don't run reliably + static bool initialized = false; + if (!initialized) { + register_backend(backend); + initialized = true; + } +#else + // On other platforms, static initialization already happened (void)success_with_compiler; +#endif } } // namespace executorch::backends From afadb48000b7cb893cee70005dcbdcf2b44e814f Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:11:02 -0700 Subject: [PATCH 10/34] add log --- backends/cuda/runtime/cuda_backend.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 1e6325fe277..945dfda4209 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -391,6 +391,9 @@ static executorch::runtime::Error success_with_compiler = #endif extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { + ET_LOG( + Info, + "CALLING INITCUDABACKEND"); #ifdef _WIN32 // On Windows, explicitly register the backend since DLL static initializers // don't run reliably From 357bf9f1ba44faafb46e215a0e27e6dd2795f20e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:15:40 -0700 Subject: [PATCH 11/34] more hacks --- examples/models/voxtral/multimodal.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 0bfdb60dfca..90663ee2158 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -283,13 +283,19 @@ MultimodalInput processAudioFile( } // namespace // Forward declare the initialization function from aoti_cuda +#ifdef _WIN32 +extern "C" __declspec(dllimport) void InitCudaBackend(); +#else extern "C" void InitCudaBackend(); +#endif int32_t main(int32_t argc, char** argv) { #ifdef _WIN32 // On Windows, explicitly initialize the CUDA backend to ensure // static initializers in the DLL run + ET_LOG(Info, "About to call InitCudaBackend"); InitCudaBackend(); + ET_LOG(Info, "InitCudaBackend returned"); #endif gflags::ParseCommandLineFlags(&argc, &argv, true); From d2d8acf021899ecd80cda51a431254a4cbfd0858 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:18:36 -0700 Subject: [PATCH 12/34] hacks --- backends/cuda/runtime/cuda_backend.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 945dfda4209..5454213dfca 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -391,19 +391,25 @@ static executorch::runtime::Error success_with_compiler = #endif extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { - ET_LOG( - Info, - "CALLING INITCUDABACKEND"); #ifdef _WIN32 // On Windows, explicitly register the backend since DLL static initializers // don't run reliably static bool initialized = false; if (!initialized) { - register_backend(backend); + ET_LOG(Info, "Registering CUDA backend on Windows"); + auto error = register_backend(backend); + if (error == executorch::runtime::Error::Ok) { + ET_LOG(Info, "Successfully registered CudaBackend"); + } else { + ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error); + } initialized = true; + } else { + ET_LOG(Info, "CUDA backend already initialized"); } #else // On other platforms, static initialization already happened + ET_LOG(Info, "CUDA backend using static initialization"); (void)success_with_compiler; #endif } From 3f56f132c982ef17002189916b25f76144e84ece Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:25:54 -0700 Subject: [PATCH 13/34] config specific lib and bin locations --- backends/cuda/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index f55d94ec165..7d5cfbedca1 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -49,12 +49,12 @@ if(MSVC) add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) # Define export macro for Windows DLL target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS) - # Ensure proper DLL import/export library naming on Windows + # Ensure proper DLL import/export library naming on Windows with config-specific paths set_target_properties(aoti_cuda PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT - RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ ) else() add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) From b3b654d40b17c63e9703a3b2d7c4bf1d1d6dd8aa Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:34:35 -0700 Subject: [PATCH 14/34] hacks --- backends/cuda/runtime/cuda_backend.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 5454213dfca..f9be82c4140 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -391,7 +391,11 @@ static executorch::runtime::Error success_with_compiler = #endif extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { + // Log immediately to confirm function is entered + ET_LOG(Info, "InitCudaBackend: Function entered"); + #ifdef _WIN32 + ET_LOG(Info, "InitCudaBackend: Windows path"); // On Windows, explicitly register the backend since DLL static initializers // don't run reliably static bool initialized = false; @@ -408,10 +412,12 @@ extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { ET_LOG(Info, "CUDA backend already initialized"); } #else + ET_LOG(Info, "InitCudaBackend: Non-Windows path"); // On other platforms, static initialization already happened - ET_LOG(Info, "CUDA backend using static initialization"); (void)success_with_compiler; #endif + + ET_LOG(Info, "InitCudaBackend: Function exiting"); } } // namespace executorch::backends From cac7849d115b05502a1b237a6d7148b6d846417b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 16:51:33 -0700 Subject: [PATCH 15/34] hacks --- backends/cuda/CMakeLists.txt | 7 +++-- backends/cuda/runtime/cuda_backend.cpp | 11 ++----- backends/cuda/runtime/cuda_backend_init.h | 35 +++++++++++++++++++++++ examples/models/voxtral/multimodal.cpp | 9 ++---- 4 files changed, 46 insertions(+), 16 deletions(-) create mode 100644 backends/cuda/runtime/cuda_backend_init.h diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 7d5cfbedca1..9befbe5cc56 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -47,8 +47,11 @@ set(_aoti_cuda_sources # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC if(MSVC) add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) - # Define export macro for Windows DLL - target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS) + # Define export macros for Windows DLL + target_compile_definitions(aoti_cuda PRIVATE + EXPORT_AOTI_FUNCTIONS + BUILDING_CUDA_BACKEND + ) # Ensure proper DLL import/export library naming on Windows with config-specific paths set_target_properties(aoti_cuda PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index f9be82c4140..ed55086e56f 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -21,6 +21,7 @@ // Include our shim layer headers #include #include +#include #include #include #include @@ -383,14 +384,8 @@ static executorch::runtime::Error success_with_compiler = } // namespace -// Export an initialization function for explicit backend registration -#ifdef _WIN32 -#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) -#else -#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default"))) -#endif - -extern "C" CUDA_BACKEND_INIT_EXPORT void InitCudaBackend() { +// InitCudaBackend is exported for explicit backend registration on Windows +extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() { // Log immediately to confirm function is entered ET_LOG(Info, "InitCudaBackend: Function entered"); diff --git a/backends/cuda/runtime/cuda_backend_init.h b/backends/cuda/runtime/cuda_backend_init.h new file mode 100644 index 00000000000..a05dc082857 --- /dev/null +++ b/backends/cuda/runtime/cuda_backend_init.h @@ -0,0 +1,35 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) +#define CUDA_BACKEND_INIT_IMPORT __declspec(dllimport) +#else +#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default"))) +#define CUDA_BACKEND_INIT_IMPORT +#endif + +// When building the DLL, define BUILDING_CUDA_BACKEND +// When using the DLL, this will import the function +#ifdef BUILDING_CUDA_BACKEND +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_EXPORT +#else +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_IMPORT +#endif + +/** + * Initialize the CUDA backend and register it with the ExecutorTorch runtime. + * On Windows, this must be called explicitly before loading models that use + * the CUDA backend. On other platforms, the backend is registered automatically + * via static initialization. + */ +CUDA_BACKEND_INIT_API void InitCudaBackend(); + +#ifdef __cplusplus +} +#endif diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 90663ee2158..9057707b41c 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -282,15 +282,12 @@ MultimodalInput processAudioFile( } // namespace -// Forward declare the initialization function from aoti_cuda -#ifdef _WIN32 -extern "C" __declspec(dllimport) void InitCudaBackend(); -#else -extern "C" void InitCudaBackend(); +#ifdef EXECUTORCH_BUILD_CUDA +#include #endif int32_t main(int32_t argc, char** argv) { -#ifdef _WIN32 +#ifdef EXECUTORCH_BUILD_CUDA // On Windows, explicitly initialize the CUDA backend to ensure // static initializers in the DLL run ET_LOG(Info, "About to call InitCudaBackend"); From 6040e3ab5cc0e9cc01a55df54961301ef58a3697 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:05:19 -0700 Subject: [PATCH 16/34] defs hack --- backends/cuda/CMakeLists.txt | 4 ++-- backends/cuda/aoti_cuda.def | 3 +++ backends/cuda/runtime/cuda_backend.cpp | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 backends/cuda/aoti_cuda.def diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 9befbe5cc56..36625c0a038 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -46,7 +46,7 @@ set(_aoti_cuda_sources ) # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC if(MSVC) - add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources} ${CMAKE_CURRENT_SOURCE_DIR}/aoti_cuda.def) # Define export macros for Windows DLL target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS @@ -54,7 +54,7 @@ if(MSVC) ) # Ensure proper DLL import/export library naming on Windows with config-specific paths set_target_properties(aoti_cuda PROPERTIES - WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT + WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT and .def file RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ diff --git a/backends/cuda/aoti_cuda.def b/backends/cuda/aoti_cuda.def new file mode 100644 index 00000000000..a1f7433e077 --- /dev/null +++ b/backends/cuda/aoti_cuda.def @@ -0,0 +1,3 @@ +LIBRARY aoti_cuda +EXPORTS + InitCudaBackend diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index ed55086e56f..1545347bc3d 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -388,6 +389,7 @@ static executorch::runtime::Error success_with_compiler = extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() { // Log immediately to confirm function is entered ET_LOG(Info, "InitCudaBackend: Function entered"); + assert(1==2) #ifdef _WIN32 ET_LOG(Info, "InitCudaBackend: Windows path"); From 25667f14a4ce911dd52c5b1a835f356ddc2df6a7 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:06:13 -0700 Subject: [PATCH 17/34] remove cuda ifdef --- examples/models/voxtral/multimodal.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 9057707b41c..fd6250a41d9 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -282,18 +282,14 @@ MultimodalInput processAudioFile( } // namespace -#ifdef EXECUTORCH_BUILD_CUDA #include -#endif int32_t main(int32_t argc, char** argv) { -#ifdef EXECUTORCH_BUILD_CUDA // On Windows, explicitly initialize the CUDA backend to ensure // static initializers in the DLL run ET_LOG(Info, "About to call InitCudaBackend"); InitCudaBackend(); ET_LOG(Info, "InitCudaBackend returned"); -#endif gflags::ParseCommandLineFlags(&argc, &argv, true); From 1ae9deda5c35eb41ebe331630c3ceebf6bcc84b1 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:10:02 -0700 Subject: [PATCH 18/34] missing semicolon --- backends/cuda/runtime/cuda_backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 1545347bc3d..7f324b4135a 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -389,7 +389,7 @@ static executorch::runtime::Error success_with_compiler = extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() { // Log immediately to confirm function is entered ET_LOG(Info, "InitCudaBackend: Function entered"); - assert(1==2) + assert(1==2); #ifdef _WIN32 ET_LOG(Info, "InitCudaBackend: Windows path"); From 53dc564cecd5f57257a0a868326c0c88f454e42a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:37:38 -0700 Subject: [PATCH 19/34] trying something else --- backends/cuda/CMakeLists.txt | 4 +- backends/cuda/aoti_cuda.def | 3 - backends/cuda/runtime/CudaBackend.h | 53 ++++++++++++++++ backends/cuda/runtime/cuda_backend.cpp | 84 +++++++------------------- examples/models/voxtral/multimodal.cpp | 20 ++++-- 5 files changed, 91 insertions(+), 73 deletions(-) delete mode 100644 backends/cuda/aoti_cuda.def create mode 100644 backends/cuda/runtime/CudaBackend.h diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 36625c0a038..9befbe5cc56 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -46,7 +46,7 @@ set(_aoti_cuda_sources ) # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC if(MSVC) - add_library(aoti_cuda SHARED ${_aoti_cuda_sources} ${CMAKE_CURRENT_SOURCE_DIR}/aoti_cuda.def) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) # Define export macros for Windows DLL target_compile_definitions(aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS @@ -54,7 +54,7 @@ if(MSVC) ) # Ensure proper DLL import/export library naming on Windows with config-specific paths set_target_properties(aoti_cuda PROPERTIES - WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT and .def file + WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ diff --git a/backends/cuda/aoti_cuda.def b/backends/cuda/aoti_cuda.def deleted file mode 100644 index a1f7433e077..00000000000 --- a/backends/cuda/aoti_cuda.def +++ /dev/null @@ -1,3 +0,0 @@ -LIBRARY aoti_cuda -EXPORTS - InitCudaBackend diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h new file mode 100644 index 00000000000..ec42191227e --- /dev/null +++ b/backends/cuda/runtime/CudaBackend.h @@ -0,0 +1,53 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include + +namespace executorch::backends::cuda { + +class ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + private: + /** + * Load AOTI function pointers from the shared library into the handle. + */ + ::executorch::runtime::Error load_function_pointers_into_handle( + void* so_handle, + struct AOTIDelegateHandle* handle) const; + + public: + /** + * Check if the CUDA backend is available. + */ + bool is_available() const override; + + /** + * Initialize the backend with the given context and compile specs. + * Called once per loaded binary blob. + */ + ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init( + ::executorch::runtime::BackendInitContext& context, + ::executorch::runtime::FreeableBuffer* processed, + ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec> + compile_specs) const override; + + /** + * Execute the backend with the given context and arguments. + * Called once per execution. + */ + ::executorch::runtime::Error execute( + ::executorch::runtime::BackendExecutionContext& context, + ::executorch::runtime::DelegateHandle* handle, + ::executorch::runtime::Span<::executorch::runtime::EValue*> args) + const override; + + /** + * Destroy the backend handle and clean up resources. + */ + void destroy(::executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 7f324b4135a..5733b6c62e9 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -19,10 +19,10 @@ #include #include -// Include our shim layer headers +// Include class header and shim layer headers +#include #include #include -#include #include #include #include @@ -48,12 +48,9 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class ET_EXPERIMENTAL CudaBackend final - : public ::executorch::runtime::BackendInterface { - private: - Error load_function_pointers_into_handle( - void* so_handle, - AOTIDelegateHandle* handle) const { +Error CudaBackend::load_function_pointers_into_handle( + void* so_handle, + AOTIDelegateHandle* handle) const { #define LOAD_SYMBOL(member, name) \ do { \ auto symbol_res = get_function(so_handle, #name); \ @@ -88,17 +85,14 @@ class ET_EXPERIMENTAL CudaBackend final return Error::Ok; } - public: - bool is_available() const override { - return 1; - } +bool CudaBackend::is_available() const { + return 1; +} - // Once per loaded binary blob - Result init( +Result CudaBackend::init( BackendInitContext& context, - FreeableBuffer* processed, // This will be a empty buffer - ArrayRef compile_specs // This will be my empty list - ) const override { + FreeableBuffer* processed, + ArrayRef compile_specs) const { std::string method_name; for (const CompileSpec& spec : compile_specs) { if (std::strcmp(spec.key, "method_name") == 0) { @@ -196,11 +190,10 @@ class ET_EXPERIMENTAL CudaBackend final return (DelegateHandle*)handle; // Return the handle post-processing } - // Once per execution - Error execute( - BackendExecutionContext& context, - DelegateHandle* handle_, - Span args) const override { +Error CudaBackend::execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const { AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; size_t n_inputs; @@ -322,7 +315,7 @@ class ET_EXPERIMENTAL CudaBackend final return Error::Ok; } - void destroy(DelegateHandle* handle_) const override { +void CudaBackend::destroy(DelegateHandle* handle_) const { if (handle_ == nullptr) { return; } @@ -367,54 +360,21 @@ class ET_EXPERIMENTAL CudaBackend final delete handle; clear_all_tensors(); } -}; } // namespace executorch::backends::cuda namespace executorch::backends { -namespace { -// Static backend instance and registration + +// Backend instance - static on all platforms auto cls = cuda::CudaBackend(); -executorch::runtime::Backend backend{"CudaBackend", &cls}; +executorch::runtime::Backend cuda_backend{"CudaBackend", &cls}; #ifndef _WIN32 -// On non-Windows platforms, use static initialization +// On non-Windows platforms, use automatic static initialization +namespace { static executorch::runtime::Error success_with_compiler = - register_backend(backend); -#endif - + register_backend(cuda_backend); } // namespace - -// InitCudaBackend is exported for explicit backend registration on Windows -extern "C" CUDA_BACKEND_INIT_API void InitCudaBackend() { - // Log immediately to confirm function is entered - ET_LOG(Info, "InitCudaBackend: Function entered"); - assert(1==2); - -#ifdef _WIN32 - ET_LOG(Info, "InitCudaBackend: Windows path"); - // On Windows, explicitly register the backend since DLL static initializers - // don't run reliably - static bool initialized = false; - if (!initialized) { - ET_LOG(Info, "Registering CUDA backend on Windows"); - auto error = register_backend(backend); - if (error == executorch::runtime::Error::Ok) { - ET_LOG(Info, "Successfully registered CudaBackend"); - } else { - ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error); - } - initialized = true; - } else { - ET_LOG(Info, "CUDA backend already initialized"); - } -#else - ET_LOG(Info, "InitCudaBackend: Non-Windows path"); - // On other platforms, static initialization already happened - (void)success_with_compiler; #endif - - ET_LOG(Info, "InitCudaBackend: Function exiting"); -} } // namespace executorch::backends diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index fd6250a41d9..c116c111154 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -25,6 +25,10 @@ #include #include +// Manually register the CUDA backend +#include +#include + #if defined(ET_USE_THREADPOOL) #include #include @@ -282,14 +286,18 @@ MultimodalInput processAudioFile( } // namespace -#include int32_t main(int32_t argc, char** argv) { - // On Windows, explicitly initialize the CUDA backend to ensure - // static initializers in the DLL run - ET_LOG(Info, "About to call InitCudaBackend"); - InitCudaBackend(); - ET_LOG(Info, "InitCudaBackend returned"); + // Manually register the CUDA backend (required on Windows, harmless on other platforms) + ET_LOG(Info, "Registering CUDA backend"); + static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend(); + static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; + auto error = ::executorch::runtime::register_backend(cuda_backend); + if (error == ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Successfully registered CudaBackend"); + } else { + ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error); + } gflags::ParseCommandLineFlags(&argc, &argv, true); From daae0c0e1d20ddf31c418343479a684f2894200f Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:46:45 -0700 Subject: [PATCH 20/34] var name --- examples/models/voxtral/multimodal.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index c116c111154..ce8f5cc590a 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -292,11 +292,11 @@ int32_t main(int32_t argc, char** argv) { ET_LOG(Info, "Registering CUDA backend"); static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend(); static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; - auto error = ::executorch::runtime::register_backend(cuda_backend); - if (error == ::executorch::runtime::Error::Ok) { + auto register_error = ::executorch::runtime::register_backend(cuda_backend); + if (register_error == ::executorch::runtime::Error::Ok) { ET_LOG(Info, "Successfully registered CudaBackend"); } else { - ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)error); + ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)register_error); } gflags::ParseCommandLineFlags(&argc, &argv, true); From b9d1a39dfc524ca4b1e901f317f536324abbd9d0 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:54:08 -0700 Subject: [PATCH 21/34] new approach --- backends/cuda/runtime/CudaBackend.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h index ec42191227e..22dad0dad6e 100644 --- a/backends/cuda/runtime/CudaBackend.h +++ b/backends/cuda/runtime/CudaBackend.h @@ -2,13 +2,14 @@ #pragma once +#include #include #include #include namespace executorch::backends::cuda { -class ET_EXPERIMENTAL CudaBackend final +class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: /** From c0915f98492af094dffbf7fff25811acff76d1ed Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 17:57:39 -0700 Subject: [PATCH 22/34] test --- backends/cuda/runtime/CudaBackend.h | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h index 22dad0dad6e..1c4fb70422b 100644 --- a/backends/cuda/runtime/CudaBackend.h +++ b/backends/cuda/runtime/CudaBackend.h @@ -2,6 +2,7 @@ #pragma once +#include #include #include #include From 971bf499ac92d877f446b554bf98b129eb43f5eb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 18:02:32 -0700 Subject: [PATCH 23/34] test --- backends/cuda/runtime/CudaBackend.h | 9 --------- backends/cuda/runtime/cuda_backend.cpp | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h index 1c4fb70422b..c15ac7ee8aa 100644 --- a/backends/cuda/runtime/CudaBackend.h +++ b/backends/cuda/runtime/CudaBackend.h @@ -2,7 +2,6 @@ #pragma once -#include #include #include #include @@ -12,14 +11,6 @@ namespace executorch::backends::cuda { class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { - private: - /** - * Load AOTI function pointers from the shared library into the handle. - */ - ::executorch::runtime::Error load_function_pointers_into_handle( - void* so_handle, - struct AOTIDelegateHandle* handle) const; - public: /** * Check if the CUDA backend is available. diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 5733b6c62e9..b0bd993a26e 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -48,7 +48,7 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -Error CudaBackend::load_function_pointers_into_handle( +Error load_function_pointers_into_handle( void* so_handle, AOTIDelegateHandle* handle) const { #define LOAD_SYMBOL(member, name) \ From d1b26ceb59a2c4cce32e57697fe5967edd46bed5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 18:04:28 -0700 Subject: [PATCH 24/34] remove const --- backends/cuda/runtime/cuda_backend.cpp | 38 +++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index b0bd993a26e..c7d7b634799 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -48,9 +48,9 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -Error load_function_pointers_into_handle( +static Error load_function_pointers_into_handle( void* so_handle, - AOTIDelegateHandle* handle) const { + AOTIDelegateHandle* handle) { #define LOAD_SYMBOL(member, name) \ do { \ auto symbol_res = get_function(so_handle, #name); \ @@ -60,30 +60,30 @@ Error load_function_pointers_into_handle( handle->member = reinterpret_cast(symbol_res.get()); \ } while (0) - LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); + LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); - LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); + LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); - LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); + LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); - LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); + LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); - LOAD_SYMBOL(run, AOTInductorModelContainerRun); + LOAD_SYMBOL(run, AOTInductorModelContainerRun); #undef LOAD_SYMBOL - auto symbol_res = - get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); - if (symbol_res.ok()) { - handle->update_constants_from_blob = - reinterpret_cast( - symbol_res.get()); - } else { - ET_LOG( - Info, - "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)"); - } - return Error::Ok; + auto symbol_res = + get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); + if (symbol_res.ok()) { + handle->update_constants_from_blob = + reinterpret_cast( + symbol_res.get()); + } else { + ET_LOG( + Info, + "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)"); } + return Error::Ok; +} bool CudaBackend::is_available() const { return 1; From 9dc28d933656f06f34dce49e27d6925a14aefac6 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 18:34:55 -0700 Subject: [PATCH 25/34] hacks --- CMakeLists.txt | 7 ++-- backends/aoti/common_shims.cpp | 6 ++- backends/aoti/common_shims.h | 67 +++++++++++++++++++++++----------- backends/cuda/CMakeLists.txt | 3 +- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b96c12fbf3..bf5188acacc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,11 +99,10 @@ announce_configured_options(CCACHE_PROGRAM) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# Force logging to always be enabled for this build. if(NOT EXECUTORCH_ENABLE_LOGGING) - # Avoid pulling in the logging strings, which can be large. Note that this - # will set the compiler flag for all targets in this directory, and for all - # subdirectories included after this point. - add_definitions(-DET_LOG_ENABLED=0) + message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.") + set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Build with ET_LOG_ENABLED" FORCE) endif() add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL}) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index ac87d49d5a5..eca0f91589b 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 1b0429e3aba..dae712dfdb4 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -15,6 +15,13 @@ #include #include +#if defined(BUILDING_CUDA_BACKEND) +#include +#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT +#else +#define AOTI_SHIM_EXPORT +#endif + namespace executorch { namespace backends { namespace aoti { @@ -30,52 +37,68 @@ using AOTIRuntimeError = Error; using AOTITorchError = Error; // Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; +extern AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +extern AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; // Attribute-related operations (memory-irrelevant) -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr( + Tensor* tensor, + void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_offset( Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_strides( + Tensor* tensor, + int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dtype( + Tensor* tensor, + int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_sizes( + Tensor* tensor, + int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size( + Tensor* tensor, + int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_index( Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dim( + Tensor* tensor, + int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); } // extern "C" } // namespace aoti } // namespace backends } // namespace executorch + +#undef AOTI_SHIM_EXPORT diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 9befbe5cc56..8cc71dbf437 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -43,6 +43,7 @@ set(_aoti_cuda_sources runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu runtime/platform/platform.cpp + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) # Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC if(MSVC) @@ -81,7 +82,7 @@ target_link_options( # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries target_link_libraries( - aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS} ) # If you need other CUDA libraries, link them similarly: # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) From 8b4696434b3c24d39b4b1ecb791e9750de1413d5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 18:40:36 -0700 Subject: [PATCH 26/34] move vectors --- backends/aoti/common_shims.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index dae712dfdb4..162f4d39b78 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -30,18 +30,16 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_sizes; -extern AOTI_SHIM_EXPORT std::unordered_map> - tensor_to_strides; - // Attribute-related operations (memory-irrelevant) AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr( Tensor* tensor, @@ -100,5 +98,3 @@ AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); } // namespace aoti } // namespace backends } // namespace executorch - -#undef AOTI_SHIM_EXPORT From cbacd6c4ed9af138e548a13a64cac43a10ec359f Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 19:38:27 -0700 Subject: [PATCH 27/34] lint --- CMakeLists.txt | 5 +- backends/aoti/common_shims.h | 40 +- backends/cuda/CMakeLists.txt | 20 +- backends/cuda/runtime/cuda_backend.cpp | 484 +++++++++--------- backends/cuda/runtime/export.h | 9 +- backends/cuda/runtime/shims/cuda_guard.h | 23 +- .../cuda/runtime/shims/tensor_attribute.h | 7 +- examples/models/voxtral/multimodal.cpp | 14 +- 8 files changed, 301 insertions(+), 301 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf5188acacc..e04760f659d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,7 +102,10 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Force logging to always be enabled for this build. if(NOT EXECUTORCH_ENABLE_LOGGING) message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.") - set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Build with ET_LOG_ENABLED" FORCE) + set(EXECUTORCH_ENABLE_LOGGING + ON + CACHE BOOL "Build with ET_LOG_ENABLED" FORCE + ) endif() add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL}) diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 162f4d39b78..2b3f32626e4 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -41,37 +41,29 @@ using AOTIRuntimeError = Error; using AOTITorchError = Error; // Attribute-related operations (memory-irrelevant) -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_data_ptr( - Tensor* tensor, - void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_strides( - Tensor* tensor, - int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dtype( - Tensor* tensor, - int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_sizes( - Tensor* tensor, - int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size( - Tensor* tensor, - int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_dim( - Tensor* tensor, - int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 8cc71dbf437..3d0aa450b44 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -49,16 +49,18 @@ set(_aoti_cuda_sources if(MSVC) add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) # Define export macros for Windows DLL - target_compile_definitions(aoti_cuda PRIVATE - EXPORT_AOTI_FUNCTIONS - BUILDING_CUDA_BACKEND + target_compile_definitions( + aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND ) - # Ensure proper DLL import/export library naming on Windows with config-specific paths - set_target_properties(aoti_cuda PROPERTIES - WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via AOTI_CUDA_EXPORT - RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + # Ensure proper DLL import/export library naming on Windows with + # config-specific paths + set_target_properties( + aoti_cuda + PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via + # AOTI_CUDA_EXPORT + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ ) else() add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index c7d7b634799..029f7ad16bb 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include @@ -20,9 +20,9 @@ #include // Include class header and shim layer headers -#include #include #include +#include #include #include #include @@ -90,277 +90,277 @@ bool CudaBackend::is_available() const { } Result CudaBackend::init( - BackendInitContext& context, - FreeableBuffer* processed, - ArrayRef compile_specs) const { - std::string method_name; - for (const CompileSpec& spec : compile_specs) { - if (std::strcmp(spec.key, "method_name") == 0) { - method_name.assign( - static_cast(spec.value.buffer), - spec.value.nbytes); // no nullptr guarantee, so pass size - break; - } + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if (std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; } + } - std::string so_blob_key = - method_name.empty() ? "so_blob" : method_name + "_so_blob"; - - const NamedDataMap* named_data_map = context.get_named_data_map(); - auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); - ET_CHECK_OR_RETURN_ERROR( - aoti_dso_buffer.ok(), - Internal, - "Failed to get data for key %s: 0x%x", - so_blob_key.c_str(), - static_cast(aoti_dso_buffer.error())); - - // Generate dynamic temporary file path - filesystem::path temp_dir = filesystem::temp_directory_path(); - filesystem::path so_path = - temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); - - // Create a temporary file - ofstream outfile(so_path, ios::binary); - - // Write the ELF buffer to the temporary file - ET_LOG( - Info, - "Writing %zu bytes to %s", - aoti_dso_buffer->size(), - so_path.c_str()); - - outfile.write( - static_cast(aoti_dso_buffer->data()), - aoti_dso_buffer->size()); + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_dso_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_dso_buffer.error())); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path, ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_dso_buffer->size(), + so_path.c_str()); + + outfile.write( + static_cast(aoti_dso_buffer->data()), + aoti_dso_buffer->size()); + + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + + // Finish writing the file to disk + outfile.close(); + + // Free the buffer immediately after writing to disk + aoti_dso_buffer->Free(); + // Load the lib + Result lib_handle_res = load_library(so_path); + if (!lib_handle_res.ok()) { + return lib_handle_res.error(); + } + void* lib_handle = lib_handle_res.get(); - ET_CHECK_OR_RETURN_ERROR( - outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + processed->Free(); - // Finish writing the file to disk - outfile.close(); + // Create handle and load function pointers into it + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = lib_handle; + handle->so_path = so_path.string(); - // Free the buffer immediately after writing to disk - aoti_dso_buffer->Free(); - // Load the lib - Result lib_handle_res = load_library(so_path); - if (!lib_handle_res.ok()) { - return lib_handle_res.error(); - } - void* lib_handle = lib_handle_res.get(); + // Load function pointers specific to this handle's shared library + ET_CHECK_OK_OR_RETURN_ERROR( + load_function_pointers_into_handle(lib_handle, handle)); - processed->Free(); + AOTInductorModelContainerHandle container_handle = nullptr; - // Create handle and load function pointers into it - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = lib_handle; - handle->so_path = so_path.string(); + ET_CHECK_OK_OR_RETURN_ERROR( + handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - // Load function pointers specific to this handle's shared library - ET_CHECK_OK_OR_RETURN_ERROR( - load_function_pointers_into_handle(lib_handle, handle)); + ET_LOG(Info, "container_handle = %p", container_handle); - AOTInductorModelContainerHandle container_handle = nullptr; + handle->container_handle = container_handle; - ET_CHECK_OK_OR_RETURN_ERROR( - handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - - ET_LOG(Info, "container_handle = %p", container_handle); - - handle->container_handle = container_handle; - - // Look into named data map for constant data - std::string weights_blob_key = - method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; - auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); - if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { - ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); - const void* weights_blob = buffer_res->data(); - // Feed the weights blob into the container. Under the hood it's copying - // weights, so we should free the buffer immediately. - ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( - handle->container_handle, static_cast(weights_blob))); - buffer_res->Free(); - } - // Create a CUDA stream for asynchronous execution - cudaStream_t cuda_stream; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); - handle->cuda_stream = static_cast(cuda_stream); - - return (DelegateHandle*)handle; // Return the handle post-processing + // Look into named data map for constant data + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); + const void* weights_blob = buffer_res->data(); + // Feed the weights blob into the container. Under the hood it's copying + // weights, so we should free the buffer immediately. + ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( + handle->container_handle, static_cast(weights_blob))); + buffer_res->Free(); } + // Create a CUDA stream for asynchronous execution + cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + + return (DelegateHandle*)handle; // Return the handle post-processing +} Error CudaBackend::execute( BackendExecutionContext& context, DelegateHandle* handle_, Span args) const { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + size_t n_inputs; + handle->get_num_inputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + handle->get_num_outputs(handle->container_handle, &n_outputs); + + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", + n_inputs, + n_outputs, + args.size()) + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); - size_t n_inputs; - handle->get_num_inputs(handle->container_handle, &n_inputs); + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for input %d", + i); - size_t n_outputs; - handle->get_num_outputs(handle->container_handle, &n_outputs); + gpu_inputs[i] = gpu_input_handle; + // Copy data from CPU to GPU ET_CHECK_OR_RETURN_ERROR( - n_inputs + n_outputs == args.size(), - InvalidArgument, - "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", - n_inputs, - n_outputs, - args.size()) - - // NOTE: ExecuTorch tensors are always on CPU/host memory - // We need to create GPU copies for CUDA kernel execution - std::vector gpu_inputs( - n_inputs); // GPU copies for kernel execution - std::vector gpu_outputs( - n_outputs); // GPU tensors for kernel output - - // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies - for (int i = 0; i < n_inputs; i++) { - // Get tensor dimensions and properties from ExecuTorch CPU tensor - auto cpu_tensor = &(args[i]->toTensor()); - auto sizes = cpu_tensor->sizes(); - auto scalar_type = cpu_tensor->scalar_type(); - - // Create GPU tensor with same shape - std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_input_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_input_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for input %d", - i); - - gpu_inputs[i] = gpu_input_handle; - - // Copy data from CPU to GPU - ET_CHECK_OR_RETURN_ERROR( - aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, - Internal, - "Failed to copy input %d from CPU to GPU", - i); - } - // Process output tensors: create GPU counterparts for ExecuTorch CPU - // tensors - for (int i = 0; i < n_outputs; i++) { - // Get output tensor dimensions from ExecuTorch CPU tensor - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - auto sizes = cpu_output_tensor->sizes(); - auto scalar_type = cpu_output_tensor->scalar_type(); - - // Create GPU tensor with same shape for kernel output - std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_output_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_output_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for output %d", - i); - - gpu_outputs[i] = gpu_output_handle; - } - // Run AOTI container with GPU tensors - AOTIRuntimeError error = handle->run( - handle->container_handle, - gpu_inputs.data(), // Use GPU input tensors - n_inputs, - gpu_outputs.data(), // Use GPU output tensors - n_outputs, - handle->cuda_stream, // Pass the actual CUDA stream - nullptr); // proxy_executor_handle can remain nullptr + aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, + Internal, + "Failed to copy input %d from CPU to GPU", + i); + } + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); ET_CHECK_OR_RETURN_ERROR( - error == Error::Ok, + create_err == Error::Ok, Internal, - "AOTInductorModelContainerRun failed with error code %d", - error); - - // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - // For DYNAMIC_BOUND tensors we try to resize - ET_CHECK_OK_OR_RETURN_ERROR( - resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", - i); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", - i); - } + "Failed to create GPU tensor for output %d", + i); - return Error::Ok; + gpu_outputs[i] = gpu_output_handle; + } + // Run AOTI container with GPU tensors + AOTIRuntimeError error = handle->run( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + handle->cuda_stream, // Pass the actual CUDA stream + nullptr); // proxy_executor_handle can remain nullptr + + ET_CHECK_OR_RETURN_ERROR( + error == Error::Ok, + Internal, + "AOTInductorModelContainerRun failed with error code %d", + error); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); } -void CudaBackend::destroy(DelegateHandle* handle_) const { - if (handle_ == nullptr) { - return; - } - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - // Destroy the CUDA stream if it exists - if (handle->cuda_stream != nullptr) { - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); - cudaError_t stream_err = cudaStreamDestroy(cuda_stream); - ET_CHECK_OR_LOG_ERROR( - stream_err == cudaSuccess, - "Failed to destroy CUDA stream: %s", - cudaGetErrorString(stream_err)); - handle->cuda_stream = nullptr; - } + return Error::Ok; +} - // NOTE: AOTInductorModelContainerDelete does not work correctly with - // multiple .so files. Deleting one container frees shared resources, - // which causes segmentation faults when attempting to delete other - // containers. As a workaround, we skip explicit container deletion - // and defer cleanup to the OS. - // TODO(gasoonjia): Find a proper solution for safe container deletion. - // AOTInductorModelContainerDelete(handle->container_handle); - - // Now close the shared library - auto err = Error::Ok; - if (handle->so_handle != nullptr) { - err = close_library(handle->so_handle); - } +void CudaBackend::destroy(DelegateHandle* handle_) const { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Destroy the CUDA stream if it exists + if (handle->cuda_stream != nullptr) { + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t stream_err = cudaStreamDestroy(cuda_stream); + ET_CHECK_OR_LOG_ERROR( + stream_err == cudaSuccess, + "Failed to destroy CUDA stream: %s", + cudaGetErrorString(stream_err)); + handle->cuda_stream = nullptr; + } - // Remove the temporary shared library file - if (!handle->so_path.empty()) { - std::error_code remove_error; - std::filesystem::remove(handle->so_path, remove_error); - ET_CHECK_OR_LOG_ERROR( - !remove_error, - "Failed to remove temporary shared library %s: %s", - handle->so_path.c_str(), - remove_error.message().c_str()); - } + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. + // AOTInductorModelContainerDelete(handle->container_handle); + + // Now close the shared library + auto err = Error::Ok; + if (handle->so_handle != nullptr) { + err = close_library(handle->so_handle); + } - delete handle; - clear_all_tensors(); + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); } + delete handle; + clear_all_tensors(); +} + } // namespace executorch::backends::cuda namespace executorch::backends { diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h index 23623888e85..65fe7192611 100644 --- a/backends/cuda/runtime/export.h +++ b/backends/cuda/runtime/export.h @@ -9,10 +9,11 @@ #pragma once // Define export macro for Windows DLL -// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by CMake, -// which causes this macro to export symbols using __declspec(dllexport). -// When consuming the library, the macro imports symbols using __declspec(dllimport). -// On non-Windows platforms, the macro is empty and has no effect. +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). On non-Windows platforms, the macro is empty and has +// no effect. #ifdef _WIN32 #ifdef EXPORT_AOTI_FUNCTIONS #define AOTI_CUDA_EXPORT __declspec(dllexport) diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index 932ce6ab785..ec2b381f524 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,8 +10,8 @@ #include #include -#include #include +#include #include namespace executorch::backends::cuda { @@ -34,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -45,7 +44,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -55,9 +55,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle gua * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -82,7 +81,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. @@ -92,9 +92,8 @@ AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGu * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 8b4950a154d..3c881df63b3 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,9 +8,9 @@ #pragma once +#include #include #include -#include #include namespace executorch::backends::cuda { @@ -25,9 +25,8 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTI_CUDA_EXPORT AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda(); diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index ce8f5cc590a..ff483cce516 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -286,19 +286,23 @@ MultimodalInput processAudioFile( } // namespace - int32_t main(int32_t argc, char** argv) { - // Manually register the CUDA backend (required on Windows, harmless on other platforms) + // Manually register the CUDA backend (required on Windows, harmless on other + // platforms) ET_LOG(Info, "Registering CUDA backend"); static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend(); - static auto cuda_backend = ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; + static auto cuda_backend = + ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; auto register_error = ::executorch::runtime::register_backend(cuda_backend); if (register_error == ::executorch::runtime::Error::Ok) { ET_LOG(Info, "Successfully registered CudaBackend"); } else { - ET_LOG(Error, "Failed to register CudaBackend: error code %d", (int)register_error); + ET_LOG( + Error, + "Failed to register CudaBackend: error code %d", + (int)register_error); } - + gflags::ParseCommandLineFlags(&argc, &argv, true); const char* model_path = FLAGS_model_path.c_str(); From bec6c5fca1cabeab0036139c05bfa4019d640b7d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 21:04:13 -0700 Subject: [PATCH 28/34] stub --- backends/aoti/common_shims.cpp | 53 +++++++++++++++++++++++++--------- backends/aoti/common_shims.h | 10 +++++++ 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index eca0f91589b..7b1728bf6e4 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -14,13 +15,11 @@ namespace executorch { namespace backends { namespace aoti { -namespace internal { // Global storage for tensor metadata AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; -} // namespace internal extern "C" { @@ -52,10 +51,10 @@ AOTITorchError aoti_torch_get_storage_offset( } AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { - auto it = internal::tensor_to_strides.find(tensor); + auto it = tensor_to_strides.find(tensor); bool needs_update = false; - if (it == internal::tensor_to_strides.end()) { + if (it == tensor_to_strides.end()) { needs_update = true; } else { // CRITICAL: Multimodal models reuse tensors with different shapes across @@ -76,9 +75,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { for (int i = 0; i < tensor->dim(); i++) { strides[i] = tensor_strides[i]; } - it = - internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) - .first; + it = tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -100,10 +97,10 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { } AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { - auto it = internal::tensor_to_sizes.find(tensor); + auto it = tensor_to_sizes.find(tensor); bool needs_update = false; - if (it == internal::tensor_to_sizes.end()) { + if (it == tensor_to_sizes.end()) { needs_update = true; } else { // CRITICAL: Multimodal models reuse tensors with different shapes across @@ -124,8 +121,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { for (int i = 0; i < tensor->dim(); i++) { sizes[i] = tensor_sizes[i]; } - it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) - .first; + it = tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -198,8 +194,39 @@ size_t aoti_torch_dtype_element_size(int32_t dtype) { // Cleanup functions void cleanup_tensor_metadata() { - internal::tensor_to_sizes.clear(); - internal::tensor_to_strides.clear(); + tensor_to_sizes.clear(); + tensor_to_strides.clear(); +} + +void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTITorchError aoti_torch_clone_preserve_strides( + Tensor* self, + Tensor** ret_new_tensor) { + ET_CHECK_OR_RETURN_ERROR( + self != nullptr, + InvalidArgument, + "aoti_torch_clone_preserve_strides failed: self tensor is null"); + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch_clone_preserve_strides failed: ret_new_tensor is null"); + + auto cloned = executorch::extension::clone_tensor_ptr(*self); + ET_CHECK_OR_RETURN_ERROR( + cloned != nullptr, + InvalidArgument, + "aoti_torch_clone_preserve_strides failed: tensor clone returned null"); + + *ret_new_tensor = new Tensor(*cloned); + + return Error::Ok; } } // extern "C" diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 2b3f32626e4..e95b2b74cbc 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -85,6 +85,16 @@ AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides( + Tensor* self, + Tensor** ret_new_tensor); + } // extern "C" } // namespace aoti From d8fafd34ef83c50e7f3bb81a11f9bda20951a203 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 21:13:39 -0700 Subject: [PATCH 29/34] stubbed shims --- backends/aoti/common_shims.cpp | 72 ++++++++++++++++++++++++---------- backends/aoti/common_shims.h | 23 ++++++++++- 2 files changed, 73 insertions(+), 22 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 7b1728bf6e4..e8c56a45946 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -206,27 +206,59 @@ void aoti_torch_warn( ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg); } -AOTITorchError aoti_torch_clone_preserve_strides( - Tensor* self, - Tensor** ret_new_tensor) { - ET_CHECK_OR_RETURN_ERROR( - self != nullptr, - InvalidArgument, - "aoti_torch_clone_preserve_strides failed: self tensor is null"); - ET_CHECK_OR_RETURN_ERROR( - ret_new_tensor != nullptr, - InvalidArgument, - "aoti_torch_clone_preserve_strides failed: ret_new_tensor is null"); - - auto cloned = executorch::extension::clone_tensor_ptr(*self); - ET_CHECK_OR_RETURN_ERROR( - cloned != nullptr, - InvalidArgument, - "aoti_torch_clone_preserve_strides failed: tensor clone returned null"); - - *ret_new_tensor = new Tensor(*cloned); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} - return Error::Ok; +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + (void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; } } // extern "C" diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index e95b2b74cbc..40849a9d5af 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -91,8 +91,27 @@ AOTI_SHIM_EXPORT void aoti_torch_warn( uint32_t line, const char* msg); -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides( - Tensor* self, +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, Tensor** ret_new_tensor); } // extern "C" From d0b90342373252d8488d614b15f438d5ef41455b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 21:24:12 -0700 Subject: [PATCH 30/34] undo internal change --- backends/aoti/common_shims.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index e8c56a45946..d6e340d99b8 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -7,7 +7,6 @@ */ #include -#include #include #include @@ -15,11 +14,13 @@ namespace executorch { namespace backends { namespace aoti { +namespace internal { // Global storage for tensor metadata AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; +} // namespace internal extern "C" { @@ -51,10 +52,10 @@ AOTITorchError aoti_torch_get_storage_offset( } AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { - auto it = tensor_to_strides.find(tensor); + auto it = internal::tensor_to_strides.find(tensor); bool needs_update = false; - if (it == tensor_to_strides.end()) { + if (it == internal::tensor_to_strides.end()) { needs_update = true; } else { // CRITICAL: Multimodal models reuse tensors with different shapes across @@ -75,7 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { for (int i = 0; i < tensor->dim(); i++) { strides[i] = tensor_strides[i]; } - it = tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first; + it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -97,10 +98,10 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { } AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { - auto it = tensor_to_sizes.find(tensor); + auto it = internal::tensor_to_sizes.find(tensor); bool needs_update = false; - if (it == tensor_to_sizes.end()) { + if (it == internal::tensor_to_sizes.end()) { needs_update = true; } else { // CRITICAL: Multimodal models reuse tensors with different shapes across @@ -121,7 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { for (int i = 0; i < tensor->dim(); i++) { sizes[i] = tensor_sizes[i]; } - it = tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first; + it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -194,8 +195,8 @@ size_t aoti_torch_dtype_element_size(int32_t dtype) { // Cleanup functions void cleanup_tensor_metadata() { - tensor_to_sizes.clear(); - tensor_to_strides.clear(); + internal::tensor_to_sizes.clear(); + internal::tensor_to_strides.clear(); } void aoti_torch_warn( From e44d74b95dccd38da3b57a61fee05612d6f055ac Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 21:25:39 -0700 Subject: [PATCH 31/34] we dont have warn level --- backends/aoti/common_shims.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index d6e340d99b8..5624f5a4aa0 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -204,7 +204,7 @@ void aoti_torch_warn( const char* file, uint32_t line, const char* msg) { - ET_LOG(Warning, "[%s:%u] %s: %s", file, line, func, msg); + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); } AOTI_SHIM_EXPORT AOTITorchError From ae6157531660e74016e994b9dd186b58c71a779c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 22:01:56 -0700 Subject: [PATCH 32/34] hacky mmap change --- extension/data_loader/mmap_data_loader.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index 10bd2f35f5e..541fc36bfa7 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -94,8 +94,13 @@ Result MmapDataLoader::from( } // Cache the file size. +#if defined(_WIN32) + struct _stat64 st; + int err = ::_fstat64(fd, &st); +#else struct stat st; int err = ::fstat(fd, &st); +#endif if (err < 0) { ET_LOG( Error, @@ -106,7 +111,15 @@ Result MmapDataLoader::from( ::close(fd); return Error::AccessFailed; } - size_t file_size = st.st_size; + + uint64_t file_size_u64 = static_cast(st.st_size); + ET_CHECK_OR_RETURN_ERROR( + file_size_u64 <= std::numeric_limits::max(), + NotSupported, + "File %s is too large (%llu bytes) for current platform", + file_name, + static_cast(file_size_u64)); + size_t file_size = static_cast(file_size_u64); // Copy the filename so we can print better debug messages if reads fail. const char* file_name_copy = ::strdup(file_name); From 1d661c42188c17ad9a700ff6d8a59033e818d6bc Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 22:11:51 -0700 Subject: [PATCH 33/34] more hacky mmap edits --- extension/data_loader/mman_windows.cpp | 55 ++++++++++------------ extension/data_loader/mman_windows.h | 9 +++- extension/data_loader/mmap_data_loader.cpp | 23 +++++---- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp index 89f9f22f467..847b2aefadd 100644 --- a/extension/data_loader/mman_windows.cpp +++ b/extension/data_loader/mman_windows.cpp @@ -21,6 +21,8 @@ #include #include +#include +#include #include #ifndef STATUS_SECTION_TOO_BIG @@ -129,49 +131,42 @@ static DWORD __map_mmap_prot_file(const int prot) { } // namespace -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) { +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off) { HANDLE fm, h; - void* map = MAP_FAILED; -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4293) -#endif - - const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)off - : (DWORD)(off & 0xFFFFFFFFL); - const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((off >> 32) & 0xFFFFFFFFL); - const DWORD protect = __map_mmap_prot_page(prot); - const DWORD desiredAccess = __map_mmap_prot_file(prot); - - const off_t maxSize = off + (off_t)len; - - const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)maxSize - : (DWORD)(maxSize & 0xFFFFFFFFL); - const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - errno = 0; if (len == 0 /* Unsupported flag combinations */ || (flags & MAP_FIXED) != 0 - /* Usupported protection combinations */ + /* Unsupported protection combinations */ || prot == PROT_EXEC) { errno = EINVAL; return MAP_FAILED; } + if (off > std::numeric_limits::max() - len) { + errno = EINVAL; + return MAP_FAILED; + } + + const std::uint64_t maxSize = off + static_cast(len); + + const DWORD dwFileOffsetLow = static_cast(off & 0xFFFFFFFFULL); + const DWORD dwFileOffsetHigh = static_cast((off >> 32) & 0xFFFFFFFFULL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFULL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFULL); + h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; diff --git a/extension/data_loader/mman_windows.h b/extension/data_loader/mman_windows.h index 563db5d8b21..b9e678c121d 100644 --- a/extension/data_loader/mman_windows.h +++ b/extension/data_loader/mman_windows.h @@ -31,6 +31,7 @@ #endif #include +#include #ifdef __cplusplus extern "C" { @@ -56,7 +57,13 @@ extern "C" { #define MS_SYNC 2 #define MS_INVALIDATE 4 -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off); +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off); int munmap(void* addr, size_t len); int mprotect(void* addr, size_t len, int prot); int msync(void* addr, size_t len, int flags); diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index 541fc36bfa7..433de270703 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -180,12 +181,6 @@ Error MmapDataLoader::validate_input(size_t offset, size_t size) const { offset, size, file_size_); - ET_CHECK_OR_RETURN_ERROR( - // Recommended by a lint warning. - offset <= std::numeric_limits::max(), - InvalidArgument, - "Offset %zu too large for off_t", - offset); return Error::Ok; } @@ -220,13 +215,19 @@ Result MmapDataLoader::load( // Map the pages read-only. Use shared mappings so that other processes // can also map the same pages and share the same memory. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_SHARED, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed, @@ -328,13 +329,19 @@ Error MmapDataLoader::load_into( // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since // the data is read-only, but use PRIVATE just to further avoid accidentally // modifying the file. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_PRIVATE, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed, From 0ab474659090397b99d745d604c6bf13ea55ceef Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Sat, 18 Oct 2025 22:15:32 -0700 Subject: [PATCH 34/34] minmax issue --- extension/data_loader/mman_windows.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp index 847b2aefadd..dec991376cf 100644 --- a/extension/data_loader/mman_windows.cpp +++ b/extension/data_loader/mman_windows.cpp @@ -23,7 +23,9 @@ #include #include #include +#define NOMINMAX #include +#undef NOMINMAX #ifndef STATUS_SECTION_TOO_BIG #define STATUS_SECTION_TOO_BIG 0xC0000040L