diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b96c12fbf3..e04760f659d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,11 +99,13 @@ announce_configured_options(CCACHE_PROGRAM) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# Force logging to always be enabled for this build. if(NOT EXECUTORCH_ENABLE_LOGGING) - # Avoid pulling in the logging strings, which can be large. Note that this - # will set the compiler flag for all targets in this directory, and for all - # subdirectories included after this point. - add_definitions(-DET_LOG_ENABLED=0) + message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.") + set(EXECUTORCH_ENABLE_LOGGING + ON + CACHE BOOL "Build with ET_LOG_ENABLED" FORCE + ) endif() add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL}) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index ac87d49d5a5..5624f5a4aa0 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { @@ -74,9 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { for (int i = 0; i < tensor->dim(); i++) { strides[i] = tensor_strides[i]; } - it = - internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) - .first; + it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -122,8 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { for (int i = 0; i < tensor->dim(); i++) { sizes[i] = tensor_sizes[i]; } - it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) - .first; + it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first; } // For 0D tensors, data() returns nullptr on empty vectors, but we need to @@ -200,6 +199,69 @@ void cleanup_tensor_metadata() { internal::tensor_to_strides.clear(); } +void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + 
(void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 1b0429e3aba..40849a9d5af 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -15,6 +15,13 @@ #include #include +#if defined(BUILDING_CUDA_BACKEND) +#include +#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT +#else +#define AOTI_SHIM_EXPORT +#endif + namespace executorch { namespace backends { namespace aoti { @@ -23,56 +30,89 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; - // Attribute-related operations (memory-irrelevant) -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t 
aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); + +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor); } // extern "C" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 8b94351d469..3d0aa450b44 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -43,8 +43,28 @@ set(_aoti_cuda_sources runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu runtime/platform/platform.cpp + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC +if(MSVC) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + # Define export macros for Windows DLL + target_compile_definitions( + aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND + ) + # Ensure proper DLL import/export library naming on Windows with + # config-specific paths + set_target_properties( + aoti_cuda + PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via + # AOTI_CUDA_EXPORT + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$ + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$ + ) +else() + add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +endif() target_include_directories( aoti_cuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} @@ -64,11 +84,15 @@ target_link_options( # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries target_link_libraries( - aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS} ) # If you need other CUDA libraries, link them similarly: # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) + +# Only apply shared lib options on non-Windows platforms +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() if(BUILD_TESTING) # Add runtime @@ -82,5 +106,7 @@ endif() install( TARGETS aoti_cuda EXPORT ExecuTorchTargets - DESTINATION lib + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin ) diff --git a/backends/cuda/runtime/CudaBackend.h b/backends/cuda/runtime/CudaBackend.h new file mode 100644 index 00000000000..c15ac7ee8aa --- /dev/null +++ b/backends/cuda/runtime/CudaBackend.h @@ -0,0 +1,46 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#pragma once + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + public: + /** + * Check if the CUDA backend is available. + */ + bool is_available() const override; + + /** + * Initialize the backend with the given context and compile specs. + * Called once per loaded binary blob. + */ + ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init( + ::executorch::runtime::BackendInitContext& context, + ::executorch::runtime::FreeableBuffer* processed, + ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec> + compile_specs) const override; + + /** + * Execute the backend with the given context and arguments. + * Called once per execution. + */ + ::executorch::runtime::Error execute( + ::executorch::runtime::BackendExecutionContext& context, + ::executorch::runtime::DelegateHandle* handle, + ::executorch::runtime::Span<::executorch::runtime::EValue*> args) + const override; + + /** + * Destroy the backend handle and clean up resources. + */ + void destroy(::executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 0cef859ddfb..029f7ad16bb 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -18,9 +19,10 @@ #include #include -// Include our shim layer headers +// Include class header and shim layer headers #include #include +#include #include #include #include @@ -46,12 +48,9 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class ET_EXPERIMENTAL CudaBackend final - : public ::executorch::runtime::BackendInterface { - private: - Error load_function_pointers_into_handle( - void* so_handle, - AOTIDelegateHandle* handle) const { +static Error load_function_pointers_into_handle( + void* so_handle, + AOTIDelegateHandle* handle) { #define LOAD_SYMBOL(member, name) \ do { \ auto symbol_res = get_function(so_handle, #name); \ @@ -61,319 +60,321 @@ class ET_EXPERIMENTAL CudaBackend final handle->member = reinterpret_cast(symbol_res.get()); \ } while (0) - LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); + LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice); - LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); + LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete); - LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); + LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs); - LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); + LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs); - LOAD_SYMBOL(run, AOTInductorModelContainerRun); + LOAD_SYMBOL(run, AOTInductorModelContainerRun); #undef LOAD_SYMBOL - auto symbol_res = - get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); - if (symbol_res.ok()) { - handle->update_constants_from_blob = - reinterpret_cast( - symbol_res.get()); - } else { - ET_LOG( - Info, - "Failed to load AOTInductorModelUpdateConstantsFromBlob. 
This .so is probably compiled on an old version of torch (<2.9.0)"); + auto symbol_res = + get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"); + if (symbol_res.ok()) { + handle->update_constants_from_blob = + reinterpret_cast( + symbol_res.get()); + } else { + ET_LOG( + Info, + "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)"); + } + return Error::Ok; +} + +bool CudaBackend::is_available() const { + return 1; +} + +Result CudaBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if (std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; } - return Error::Ok; } - public: - bool is_available() const override { - return 1; + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_dso_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_dso_buffer.error())); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path, ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_dso_buffer->size(), + so_path.c_str()); + + outfile.write( + static_cast(aoti_dso_buffer->data()), + aoti_dso_buffer->size()); + + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + + // Finish writing the file to disk + outfile.close(); + + // Free the buffer immediately after writing to disk + aoti_dso_buffer->Free(); + // Load the lib + Result lib_handle_res = load_library(so_path); + if (!lib_handle_res.ok()) { + return lib_handle_res.error(); } + void* lib_handle = lib_handle_res.get(); - // Once per loaded binary blob - Result init( - BackendInitContext& context, - FreeableBuffer* processed, // This will be a empty buffer - ArrayRef compile_specs // This will be my empty list - ) const override { - std::string method_name; - for (const CompileSpec& spec : compile_specs) { - if (std::strcmp(spec.key, "method_name") == 0) { - method_name.assign( - static_cast(spec.value.buffer), - spec.value.nbytes); // no nullptr guarantee, so pass size - break; - } - } - - std::string so_blob_key = - method_name.empty() ? 
"so_blob" : method_name + "_so_blob"; - - const NamedDataMap* named_data_map = context.get_named_data_map(); - auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str()); - ET_CHECK_OR_RETURN_ERROR( - aoti_dso_buffer.ok(), - Internal, - "Failed to get data for key %s: 0x%x", - so_blob_key.c_str(), - static_cast(aoti_dso_buffer.error())); + processed->Free(); - // Generate dynamic temporary file path - filesystem::path temp_dir = filesystem::temp_directory_path(); - filesystem::path so_path = - temp_dir / (so_blob_key + to_string(get_process_id()) + ".so"); + // Create handle and load function pointers into it + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = lib_handle; + handle->so_path = so_path.string(); - // Create a temporary file - ofstream outfile(so_path, ios::binary); + // Load function pointers specific to this handle's shared library + ET_CHECK_OK_OR_RETURN_ERROR( + load_function_pointers_into_handle(lib_handle, handle)); - // Write the ELF buffer to the temporary file - ET_LOG( - Info, - "Writing %zu bytes to %s", - aoti_dso_buffer->size(), - so_path.c_str()); + AOTInductorModelContainerHandle container_handle = nullptr; - outfile.write( - static_cast(aoti_dso_buffer->data()), - aoti_dso_buffer->size()); + ET_CHECK_OK_OR_RETURN_ERROR( + handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - ET_CHECK_OR_RETURN_ERROR( - outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + ET_LOG(Info, "container_handle = %p", container_handle); - // Finish writing the file to disk - outfile.close(); + handle->container_handle = container_handle; - // Free the buffer immediately after writing to disk - aoti_dso_buffer->Free(); - // Load the lib - Result lib_handle_res = load_library(so_path); - if (!lib_handle_res.ok()) { - return lib_handle_res.error(); - } - void* lib_handle = lib_handle_res.get(); - - processed->Free(); - - // Create handle and load function pointers into it - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = lib_handle; - handle->so_path = so_path.string(); - - // Load function pointers specific to this handle's shared library - ET_CHECK_OK_OR_RETURN_ERROR( - load_function_pointers_into_handle(lib_handle, handle)); - - AOTInductorModelContainerHandle container_handle = nullptr; - - ET_CHECK_OK_OR_RETURN_ERROR( - handle->create_with_device(&container_handle, 1, "cuda", nullptr)); - - ET_LOG(Info, "container_handle = %p", container_handle); - - handle->container_handle = container_handle; - - // Look into named data map for constant data - std::string weights_blob_key = - method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; - auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); - if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { - ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); - const void* weights_blob = buffer_res->data(); - // Feed the weights blob into the container. Under the hood it's copying - // weights, so we should free the buffer immediately. 
- ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( - handle->container_handle, static_cast(weights_blob))); - buffer_res->Free(); - } - // Create a CUDA stream for asynchronous execution - cudaStream_t cuda_stream; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); - handle->cuda_stream = static_cast(cuda_stream); - - return (DelegateHandle*)handle; // Return the handle post-processing + // Look into named data map for constant data + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); + const void* weights_blob = buffer_res->data(); + // Feed the weights blob into the container. Under the hood it's copying + // weights, so we should free the buffer immediately. + ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( + handle->container_handle, static_cast(weights_blob))); + buffer_res->Free(); } + // Create a CUDA stream for asynchronous execution + cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + + return (DelegateHandle*)handle; // Return the handle post-processing +} + +Error CudaBackend::execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + size_t n_inputs; + handle->get_num_inputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + handle->get_num_outputs(handle->container_handle, &n_outputs); + + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. 
Exit.", + n_inputs, + n_outputs, + args.size()) + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); - // Once per execution - Error execute( - BackendExecutionContext& context, - DelegateHandle* handle_, - Span args) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - size_t n_inputs; - handle->get_num_inputs(handle->container_handle, &n_inputs); + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for input %d", + i); - size_t n_outputs; - handle->get_num_outputs(handle->container_handle, &n_outputs); + gpu_inputs[i] = gpu_input_handle; + // Copy data from CPU to GPU ET_CHECK_OR_RETURN_ERROR( - n_inputs + n_outputs == args.size(), - InvalidArgument, - "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", - n_inputs, - n_outputs, - args.size()) - - // NOTE: ExecuTorch tensors are always on CPU/host memory - // We need to create GPU copies for CUDA kernel execution - std::vector gpu_inputs( - n_inputs); // GPU copies for kernel execution - std::vector gpu_outputs( - n_outputs); // GPU tensors for kernel output - - // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies - for (int i = 0; i < n_inputs; i++) { - // Get tensor dimensions and properties from ExecuTorch CPU tensor - auto cpu_tensor = &(args[i]->toTensor()); - auto sizes = cpu_tensor->sizes(); - auto scalar_type = cpu_tensor->scalar_type(); - - // Create GPU tensor with same shape - std::vector sizes_vec(sizes.begin(), sizes.end()); - - AOTITensorHandle gpu_input_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_input_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for input %d", - i); - - gpu_inputs[i] = gpu_input_handle; - - // Copy data from CPU to GPU - ET_CHECK_OR_RETURN_ERROR( - aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, - Internal, - "Failed to copy input %d from CPU to GPU", - i); - } - // Process output tensors: create GPU counterparts for ExecuTorch CPU - // tensors - for (int i = 0; i < n_outputs; i++) { - // Get output tensor dimensions from ExecuTorch CPU tensor - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - auto sizes = cpu_output_tensor->sizes(); - auto scalar_type = cpu_output_tensor->scalar_type(); - - // Create GPU tensor with same shape for kernel output - std::vector sizes_vec(sizes.begin(), sizes.end()); - - 
AOTITensorHandle gpu_output_handle; - Error create_err = aoti_torch_empty_strided( - sizes_vec.size(), - sizes_vec.data(), - nullptr, // use default strides - static_cast(scalar_type), - 1, // device_type = cuda - 0, // device_index = 0 - &gpu_output_handle); - - ET_CHECK_OR_RETURN_ERROR( - create_err == Error::Ok, - Internal, - "Failed to create GPU tensor for output %d", - i); - - gpu_outputs[i] = gpu_output_handle; - } - // Run AOTI container with GPU tensors - AOTIRuntimeError error = handle->run( - handle->container_handle, - gpu_inputs.data(), // Use GPU input tensors - n_inputs, - gpu_outputs.data(), // Use GPU output tensors - n_outputs, - handle->cuda_stream, // Pass the actual CUDA stream - nullptr); // proxy_executor_handle can remain nullptr + aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, + Internal, + "Failed to copy input %d from CPU to GPU", + i); + } + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); ET_CHECK_OR_RETURN_ERROR( - error == Error::Ok, + create_err == Error::Ok, Internal, - "AOTInductorModelContainerRun failed with error code %d", - error); - - // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { - auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - // For DYNAMIC_BOUND tensors we try to resize - ET_CHECK_OK_OR_RETURN_ERROR( - resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", - i); - ET_CHECK_OK_OR_RETURN_ERROR( - aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", - i); - } + "Failed to create GPU tensor for output %d", + i); - return Error::Ok; + gpu_outputs[i] = gpu_output_handle; + } + // Run AOTI container with GPU tensors + AOTIRuntimeError error = handle->run( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + handle->cuda_stream, // Pass the actual CUDA stream + nullptr); // proxy_executor_handle can remain nullptr + + ET_CHECK_OR_RETURN_ERROR( + error == Error::Ok, + Internal, + "AOTInductorModelContainerRun failed with error code %d", + error); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); } - void destroy(DelegateHandle* handle_) const override { - if (handle_ == nullptr) { - return; - } - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - // Destroy the CUDA stream if it 
exists - if (handle->cuda_stream != nullptr) { - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); - cudaError_t stream_err = cudaStreamDestroy(cuda_stream); - ET_CHECK_OR_LOG_ERROR( - stream_err == cudaSuccess, - "Failed to destroy CUDA stream: %s", - cudaGetErrorString(stream_err)); - handle->cuda_stream = nullptr; - } + return Error::Ok; +} - // NOTE: AOTInductorModelContainerDelete does not work correctly with - // multiple .so files. Deleting one container frees shared resources, - // which causes segmentation faults when attempting to delete other - // containers. As a workaround, we skip explicit container deletion - // and defer cleanup to the OS. - // TODO(gasoonjia): Find a proper solution for safe container deletion. - // AOTInductorModelContainerDelete(handle->container_handle); - - // Now close the shared library - auto err = Error::Ok; - if (handle->so_handle != nullptr) { - err = close_library(handle->so_handle); - } +void CudaBackend::destroy(DelegateHandle* handle_) const { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Destroy the CUDA stream if it exists + if (handle->cuda_stream != nullptr) { + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t stream_err = cudaStreamDestroy(cuda_stream); + ET_CHECK_OR_LOG_ERROR( + stream_err == cudaSuccess, + "Failed to destroy CUDA stream: %s", + cudaGetErrorString(stream_err)); + handle->cuda_stream = nullptr; + } - // Remove the temporary shared library file - if (!handle->so_path.empty()) { - std::error_code remove_error; - std::filesystem::remove(handle->so_path, remove_error); - ET_CHECK_OR_LOG_ERROR( - !remove_error, - "Failed to remove temporary shared library %s: %s", - handle->so_path.c_str(), - remove_error.message().c_str()); - } + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. 
+ // AOTInductorModelContainerDelete(handle->container_handle); + + // Now close the shared library + auto err = Error::Ok; + if (handle->so_handle != nullptr) { + err = close_library(handle->so_handle); + } - delete handle; - clear_all_tensors(); + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); } -}; + + delete handle; + clear_all_tensors(); +} } // namespace executorch::backends::cuda namespace executorch::backends { -namespace { + +// Backend instance - static on all platforms auto cls = cuda::CudaBackend(); -executorch::runtime::Backend backend{"CudaBackend", &cls}; +executorch::runtime::Backend cuda_backend{"CudaBackend", &cls}; + +#ifndef _WIN32 +// On non-Windows platforms, use automatic static initialization +namespace { static executorch::runtime::Error success_with_compiler = - register_backend(backend); + register_backend(cuda_backend); } // namespace +#endif + } // namespace executorch::backends diff --git a/backends/cuda/runtime/cuda_backend_init.h b/backends/cuda/runtime/cuda_backend_init.h new file mode 100644 index 00000000000..a05dc082857 --- /dev/null +++ b/backends/cuda/runtime/cuda_backend_init.h @@ -0,0 +1,35 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#define CUDA_BACKEND_INIT_EXPORT __declspec(dllexport) +#define CUDA_BACKEND_INIT_IMPORT __declspec(dllimport) +#else +#define CUDA_BACKEND_INIT_EXPORT __attribute__((visibility("default"))) +#define CUDA_BACKEND_INIT_IMPORT +#endif + +// When building the DLL, define BUILDING_CUDA_BACKEND +// When using the DLL, this will import the function +#ifdef BUILDING_CUDA_BACKEND +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_EXPORT +#else +#define CUDA_BACKEND_INIT_API CUDA_BACKEND_INIT_IMPORT +#endif + +/** + * Initialize the CUDA backend and register it with the ExecutorTorch runtime. + * On Windows, this must be called explicitly before loading models that use + * the CUDA backend. On other platforms, the backend is registered automatically + * via static initialization. + */ +CUDA_BACKEND_INIT_API void InitCudaBackend(); + +#ifdef __cplusplus +} +#endif diff --git a/backends/cuda/runtime/export.h b/backends/cuda/runtime/export.h new file mode 100644 index 00000000000..65fe7192611 --- /dev/null +++ b/backends/cuda/runtime/export.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). On non-Windows platforms, the macro is empty and has +// no effect. 
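Note: the new cuda_backend_init.h above declares InitCudaBackend() but this patch does not show its definition. A minimal sketch of what such a definition could look like, reusing the registration pattern from cuda_backend.cpp above — include paths are assumed from the repository layout; this is illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/CudaBackend.h>
#include <executorch/backends/cuda/runtime/cuda_backend_init.h>
#include <executorch/runtime/backend/interface.h>

extern "C" void InitCudaBackend() {
  // Static locals keep the backend instance alive for the process lifetime
  // and make sure registration runs at most once.
  static auto backend_impl = ::executorch::backends::cuda::CudaBackend();
  static auto backend =
      ::executorch::runtime::Backend{"CudaBackend", &backend_impl};
  static ::executorch::runtime::Error status =
      ::executorch::runtime::register_backend(backend);
  (void)status;
}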
+#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_CUDA_EXPORT __declspec(dllexport) +#else +#define AOTI_CUDA_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_CUDA_EXPORT +#endif diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..ec2b381f524 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -33,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -44,7 +44,8 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -54,9 +55,8 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -69,7 +69,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +81,8 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
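For reference, a caller-side sketch of the guard API declared in this header. The include path, enclosing namespace, and the restore-on-delete semantics are assumed from the declarations and doc comments above; illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/shims/cuda_guard.h>

using executorch::runtime::Error;
namespace cuda = executorch::backends::cuda;

Error run_with_device_guard() {
  // Switch the current CUDA device to index 0 for the lifetime of the guard.
  cuda::CUDAGuardHandle guard = nullptr;
  Error err = cuda::aoti_torch_create_cuda_guard(0, &guard);
  if (err != Error::Ok) {
    return err;
  }
  // Ask which stream work on device 0 would currently be enqueued on.
  void* stream = nullptr;
  err = cuda::aoti_torch_get_current_cuda_stream(0, &stream);
  if (err != Error::Ok) {
    (void)cuda::aoti_torch_delete_cuda_guard(guard);
    return err;
  }
  // ... launch kernels on `stream` ...
  // Delete the guard, restoring the previously active device.
  return cuda::aoti_torch_delete_cuda_guard(guard);
}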
@@ -91,9 +92,8 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..b84a7a4d149 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..924f268d3ca 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_CUDA_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,11 +137,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_CUDA_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_CUDA_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 15a4e397d24..3c881df63b3 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -24,12 +25,11 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes 
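A caller-side sketch of the device-type helpers declared just below. The include path and namespace are assumed from the surrounding headers; illustrative only, not part of the patch:

#include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>

#include <cstdint>

using executorch::runtime::Error;
namespace cuda = executorch::backends::cuda;

// Reports whether `tensor` carries the CUDA device type.
Error tensor_is_on_cuda(
    executorch::runtime::etensor::Tensor* tensor,
    bool* out_is_cuda) {
  int32_t device_type = 0;
  Error err = cuda::aoti_torch_get_device_type(tensor, &device_type);
  if (err != Error::Ok) {
    return err;
  }
  *out_is_cuda = (device_type == cuda::aoti_torch_device_type_cuda());
  return Error::Ok;
}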
-AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_CUDA_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t aoti_torch_device_type_cuda(); +AOTI_CUDA_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 866d17160ba..cfb2acd3759 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) @@ -86,13 +84,6 @@ list( extension_flat_tensor ) -# Link CUDA backend -if(EXECUTORCH_BUILD_CUDA) - find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) -endif() - if(EXECUTORCH_BUILD_METAL) list(APPEND link_libraries metal_backend) executorch_target_link_options_shared_lib(metal_backend) @@ -104,11 +95,24 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + if(MSVC) + # On MSVC, link directly without shared lib options to use import library + list(APPEND link_libraries aoti_cuda) + else() + # On non-MSVC, use shared lib options + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) + endif() +endif() + target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 29edf955751..ff483cce516 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -25,6 +25,10 @@ #include #include +// Manually register the CUDA backend +#include +#include + #if defined(ET_USE_THREADPOOL) #include #include @@ -283,6 +287,22 @@ MultimodalInput processAudioFile( } // namespace int32_t main(int32_t argc, char** argv) { + // Manually register the CUDA backend (required on Windows, harmless on other + // platforms) + ET_LOG(Info, "Registering CUDA backend"); + static auto cuda_backend_impl = ::executorch::backends::cuda::CudaBackend(); + static auto cuda_backend = + ::executorch::runtime::Backend{"CudaBackend", &cuda_backend_impl}; + auto register_error = 
::executorch::runtime::register_backend(cuda_backend); + if (register_error == ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Successfully registered CudaBackend"); + } else { + ET_LOG( + Error, + "Failed to register CudaBackend: error code %d", + (int)register_error); + } + gflags::ParseCommandLineFlags(&argc, &argv, true); const char* model_path = FLAGS_model_path.c_str(); diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp index 89f9f22f467..dec991376cf 100644 --- a/extension/data_loader/mman_windows.cpp +++ b/extension/data_loader/mman_windows.cpp @@ -21,7 +21,11 @@ #include #include +#include +#include +#define NOMINMAX #include +#undef NOMINMAX #ifndef STATUS_SECTION_TOO_BIG #define STATUS_SECTION_TOO_BIG 0xC0000040L @@ -129,49 +133,42 @@ static DWORD __map_mmap_prot_file(const int prot) { } // namespace -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) { +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off) { HANDLE fm, h; - void* map = MAP_FAILED; -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4293) -#endif - - const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)off - : (DWORD)(off & 0xFFFFFFFFL); - const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((off >> 32) & 0xFFFFFFFFL); - const DWORD protect = __map_mmap_prot_page(prot); - const DWORD desiredAccess = __map_mmap_prot_file(prot); - - const off_t maxSize = off + (off_t)len; - - const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)maxSize - : (DWORD)(maxSize & 0xFFFFFFFFL); - const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) - ? (DWORD)0 - : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - errno = 0; if (len == 0 /* Unsupported flag combinations */ || (flags & MAP_FIXED) != 0 - /* Usupported protection combinations */ + /* Unsupported protection combinations */ || prot == PROT_EXEC) { errno = EINVAL; return MAP_FAILED; } + if (off > std::numeric_limits::max() - len) { + errno = EINVAL; + return MAP_FAILED; + } + + const std::uint64_t maxSize = off + static_cast(len); + + const DWORD dwFileOffsetLow = static_cast(off & 0xFFFFFFFFULL); + const DWORD dwFileOffsetHigh = static_cast((off >> 32) & 0xFFFFFFFFULL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFULL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFULL); + h = ((flags & MAP_ANONYMOUS) == 0) ? 
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; diff --git a/extension/data_loader/mman_windows.h b/extension/data_loader/mman_windows.h index 563db5d8b21..b9e678c121d 100644 --- a/extension/data_loader/mman_windows.h +++ b/extension/data_loader/mman_windows.h @@ -31,6 +31,7 @@ #endif #include +#include #ifdef __cplusplus extern "C" { @@ -56,7 +57,13 @@ extern "C" { #define MS_SYNC 2 #define MS_INVALIDATE 4 -void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off); +void* mmap( + void* addr, + size_t len, + int prot, + int flags, + int fildes, + std::uint64_t off); int munmap(void* addr, size_t len); int mprotect(void* addr, size_t len, int prot); int msync(void* addr, size_t len, int flags); diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index 10bd2f35f5e..433de270703 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -94,8 +95,13 @@ Result MmapDataLoader::from( } // Cache the file size. +#if defined(_WIN32) + struct _stat64 st; + int err = ::_fstat64(fd, &st); +#else struct stat st; int err = ::fstat(fd, &st); +#endif if (err < 0) { ET_LOG( Error, @@ -106,7 +112,15 @@ Result MmapDataLoader::from( ::close(fd); return Error::AccessFailed; } - size_t file_size = st.st_size; + + uint64_t file_size_u64 = static_cast(st.st_size); + ET_CHECK_OR_RETURN_ERROR( + file_size_u64 <= std::numeric_limits::max(), + NotSupported, + "File %s is too large (%llu bytes) for current platform", + file_name, + static_cast(file_size_u64)); + size_t file_size = static_cast(file_size_u64); // Copy the filename so we can print better debug messages if reads fail. const char* file_name_copy = ::strdup(file_name); @@ -167,12 +181,6 @@ Error MmapDataLoader::validate_input(size_t offset, size_t size) const { offset, size, file_size_); - ET_CHECK_OR_RETURN_ERROR( - // Recommended by a lint warning. - offset <= std::numeric_limits::max(), - InvalidArgument, - "Offset %zu too large for off_t", - offset); return Error::Ok; } @@ -207,13 +215,19 @@ Result MmapDataLoader::load( // Map the pages read-only. Use shared mappings so that other processes // can also map the same pages and share the same memory. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_SHARED, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed, @@ -315,13 +329,19 @@ Error MmapDataLoader::load_into( // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since // the data is read-only, but use PRIVATE just to further avoid accidentally // modifying the file. +#if defined(_WIN32) + const std::uint64_t map_offset = static_cast(range.start); +#else + const off_t map_offset = static_cast(range.start); +#endif + void* pages = ::mmap( nullptr, map_size, PROT_READ, MAP_PRIVATE, fd_, - static_cast(range.start)); + map_offset); ET_CHECK_OR_RETURN_ERROR( pages != MAP_FAILED, AccessFailed,