diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 8449809ffe3..34063a23374 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -63,6 +63,14 @@ build_cmake_executor_runner() { ${COMMON} \ -B${CMAKE_OUTPUT_DIR} . cmake --build ${CMAKE_OUTPUT_DIR} -j4 + elif [[ "$backend_string_select" == "CUDA" ]]; then + echo "Backend $backend_string_select selected" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + ${COMMON} \ + -B${CMAKE_OUTPUT_DIR} . + cmake --build ${CMAKE_OUTPUT_DIR} -j4 else cmake -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -323,6 +331,13 @@ test_model_with_mediatek() { EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit) } +test_model_with_cuda() { + # Export a basic .pte and .ptd, then run the model. + "${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./" + build_cmake_executor_runner "CUDA" + ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd" +} + if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." @@ -375,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "cuda" ]]; then + echo "Testing ${MODEL_NAME} with cuda..." + test_model_with_cuda + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi else set +e if [[ "${BACKEND}" == *"quantization"* ]]; then diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/cuda.yml similarity index 72% rename from .github/workflows/test-cuda-builds.yml rename to .github/workflows/cuda.yml index 5e054c1de84..8724fab99d4 100644 --- a/.github/workflows/test-cuda-builds.yml +++ b/.github/workflows/cuda.yml @@ -61,3 +61,28 @@ jobs: else echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!" 
fi + + test-models-cuda: + name: test-models-cuda + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + model: [linear, add, add_mul, resnet18] + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda diff --git a/.lintrunner.toml b/.lintrunner.toml index ef771bdb9df..b366c141799 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -219,6 +219,7 @@ exclude_patterns = [ '**/*.gif', 'extension/llm/tokenizers', 'extension/llm/tokenizers/**', + 'examples/cuda', # File contains @generated 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a36d7e563a..5a616465f13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -587,6 +587,16 @@ endif() if(EXECUTORCH_BUILD_CORTEX_M) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) + list(APPEND _executorch_backends coretex_m_backend) +endif() + +if(EXECUTORCH_BUILD_CUDA) + # Build common AOTI functionality (required for CUDA) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) + # Build CUDA-specific AOTI functionality + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) + # Add aoti_cuda to backends - it already depends on aoti_common + list(APPEND _executorch_backends aoti_cuda) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 2aa8a5692ac..852359770ba 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -30,7 +30,9 @@ set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp) add_library(aoti_common STATIC ${_aoti_common_sources}) target_include_directories( aoti_common - PUBLIC $ $ + PUBLIC $ + $ + $ # PyTorch AOTI headers from ExecuTorch's torch detection ${TORCH_INCLUDE_DIRS} ) diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h index 4b20aefc976..844bd2d5a77 100644 --- a/backends/aoti/aoti_model_container.h +++ b/backends/aoti/aoti_model_container.h @@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor; extern "C" { // Type definitions +using AOTITensorHandle = Tensor*; using AOTIRuntimeError = Error; // Forward declarations for AOT Inductor model container @@ -74,6 +75,7 @@ extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun; // AOTI Delegate Handle structure struct AOTIDelegateHandle { void* so_handle; + std::string so_path; AOTInductorModelContainerHandle container_handle; }; diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt new file mode 100644 index 00000000000..90588218c02 --- /dev/null +++ b/backends/cuda/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Build AOTI CUDA backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +cmake_minimum_required(VERSION 3.29) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# CUDA-specific AOTI functionality +set(_aoti_cuda_sources runtime/cuda_backend.cpp runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp +) +add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +target_include_directories( + aoti_cuda + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC) +# Ensure symbols are exported properly +target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic) + +# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries +target_link_libraries( + aoti_cuda + PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} + # Link PyTorch libraries for AOTI CUDA functions + ${TORCH_LIBRARIES} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) +executorch_target_link_options_shared_lib(aoti_cuda) + +install( + TARGETS aoti_cuda + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp new file mode 100644 index 00000000000..08031ce6a26 --- /dev/null +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -0,0 +1,382 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// Include our shim layer headers +#include +#include +#include + +namespace executorch { +namespace backends { +namespace cuda { + +using namespace std; +using namespace aoti; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + private: + Error register_shared_library_functions(void* so_handle) const { + AOTInductorModelContainerCreateWithDevice = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); + if (AOTInductorModelContainerCreateWithDevice == nullptr) { + ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice"); + return Error::AccessFailed; + } + + AOTInductorModelContainerDelete = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerDelete")); + if (AOTInductorModelContainerDelete == nullptr) { + ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete"); + return Error::AccessFailed; + } + + AOTInductorModelContainerGetNumInputs = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetNumInputs")); + if (AOTInductorModelContainerGetNumInputs == nullptr) { + ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs"); + return Error::AccessFailed; + } + + AOTInductorModelContainerGetNumOutputs = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs")); + if (AOTInductorModelContainerGetNumOutputs == nullptr) { + ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs"); + return Error::AccessFailed; + } + + AOTInductorModelContainerRun = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerRun")); + if (AOTInductorModelContainerRun == nullptr) { + ET_LOG(Error, "Failed to load AOTInductorModelContainerRun"); + return Error::AccessFailed; + } + + return Error::Ok; + } + + public: + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be a empty buffer + ArrayRef compile_specs // This will be my empty list + ) const override { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if (std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; + } + } + + std::string so_blob_key = + method_name.empty() ? 
"so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str()); + if (!aoti_cuda_buffer.ok()) { + ET_LOG( + Error, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + aoti_cuda_buffer.error()); + return aoti_cuda_buffer.error(); + } + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(getpid()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path.c_str(), ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_cuda_buffer->size(), + so_path.c_str()); + outfile.write( + static_cast(aoti_cuda_buffer->data()), + aoti_cuda_buffer->size()); + + if (!outfile) { + ET_LOG(Error, "Failed to write to file %s", so_path.c_str()); + return Error::AccessFailed; + } + // Finish writing the file to disk + outfile.close(); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (so_handle == nullptr) { + ET_LOG(Error, "Failed to load shared library: %s", dlerror()); + return Error::AccessFailed; + } + + processed->Free(); + + // Register all shared library functions + Error reg_err = register_shared_library_functions(so_handle); + if (reg_err != Error::Ok) { + return reg_err; + } + + AOTInductorModelContainerHandle container_handle = nullptr; + + AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice( + &container_handle, 1, "cuda", nullptr); + if (err != Error::Ok) { + return err; + } + ET_LOG(Info, "container_handle = %p", container_handle); + + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = so_handle; + handle->so_path = so_path.string(); + handle->container_handle = container_handle; + return (DelegateHandle*)handle; // Return the handle post-processing + } + + // Once per execution + Error execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const override { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + size_t n_inputs; + AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + AOTInductorModelContainerGetNumOutputs( + handle->container_handle, &n_outputs); + + if (n_inputs + n_outputs != args.size()) { + ET_LOG( + Error, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. 
Exit.", + n_inputs, + n_outputs, + args.size()); + return Error::InvalidArgument; + } + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); + + if (create_err != Error::Ok) { + ET_LOG(Error, "Failed to create GPU tensor for input %d", i); + return Error::Internal; + } + + gpu_inputs[i] = gpu_input_handle; + + // Copy data from CPU to GPU + Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0); + if (copy_err != Error::Ok) { + ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i); + return Error::Internal; + } + } + ET_LOG(Info, "Inputs copied to GPU"); + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); + + if (create_err != Error::Ok) { + ET_LOG(Error, "Failed to create GPU tensor for output %d", i); + return Error::Internal; + } + + gpu_outputs[i] = gpu_output_handle; + } + ET_LOG(Info, "Outputs created on GPU"); + // Run AOTI container with GPU tensors + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + nullptr, // Pass the actual CUDA stream! 
+ nullptr); // proxy_executor_handle can remain nullptr + + if (error != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerRun failed with error code %d", + error); + return Error::Internal; + } + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); + } + + // Clean up GPU tensors that we created (ExecuTorch tensors are always + // CPU, so all GPU tensors are our copies) + for (int i = 0; i < n_inputs; i++) { + // All GPU input tensors were created by us, delete them + aoti_torch_delete_tensor_object(gpu_inputs[i]); + } + + for (int i = 0; i < n_outputs; i++) { + // All GPU output tensors were created by us, delete them + aoti_torch_delete_tensor_object(gpu_outputs[i]); + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Delete the container BEFORE closing the shared library + if (handle->container_handle != nullptr) { + AOTIRuntimeError delete_result = + AOTInductorModelContainerDelete(handle->container_handle); + if (delete_result != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerDelete failed with error code %d", + delete_result); + } + handle->container_handle = nullptr; + } + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + if (remove_error) { + ET_LOG( + Error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); + } + } + + delete handle; + } +}; + +} // namespace cuda + +namespace { +auto cls = cuda::CudaBackend(); +executorch::runtime::Backend backend{"CudaBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace + +} // namespace backends +} // namespace executorch diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/shims/utils.h index 99d2bc102f5..02c3abfc83f 100644 --- a/backends/cuda/runtime/shims/utils.h +++ b/backends/cuda/runtime/shims/utils.h @@ -40,6 +40,7 @@ namespace cuda { // Enum for supported data types in et-cuda backend enum class SupportedDTypes : int32_t { + INT64 = 4, // PyTorch's int64 dtype code FLOAT32 = 6, // PyTorch's float32 dtype code BFLOAT16 = 15, // PyTorch's bfloat16 dtype code }; @@ -100,6 +101,7 @@ using AOTITorchError = Error; // Helper function to check if a dtype is supported in ET CUDA backend inline bool is_dtype_supported_in_et_cuda(int32_t dtype) { switch (dtype) { + case static_cast(SupportedDTypes::INT64): case static_cast(SupportedDTypes::FLOAT32): case static_cast(SupportedDTypes::BFLOAT16): return true; @@ -113,8 +115,9 @@ inline AOTITorchError validate_dtype(int32_t dtype) { ET_CHECK_OR_RETURN_ERROR( is_dtype_supported_in_et_cuda(dtype), InvalidArgument, - "Unsupported dtype: %d. Supported dtypes: %d (float32), %d (bfloat16)", + "Unsupported dtype: %d. 
Supported dtypes: %d (int64), %d (float32), %d (bfloat16)", dtype, + static_cast(SupportedDTypes::INT64), static_cast(SupportedDTypes::FLOAT32), static_cast(SupportedDTypes::BFLOAT16)); diff --git a/examples/cuda/scripts/__init__.py b/examples/cuda/scripts/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/cuda/scripts/export.py b/examples/cuda/scripts/export.py new file mode 100644 index 00000000000..c103d7ee50a --- /dev/null +++ b/examples/cuda/scripts/export.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example script for exporting simple models to flatbuffer with CUDA delegate. + +import argparse +import pathlib + +import torch + +from executorch.backends.cuda.cuda_backend import CudaBackend + +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner + +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory + +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower + +from executorch.extension.export_util.utils import save_pte_program +from torch._inductor.decomposition import conv1d_to_conv2d +from torch.nn.attention import SDPBackend + +# Script to export a model with CUDA delegation. + +_EDGE_COMPILE_CONFIG = EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, # TODO(T182928844): enable dim_order in backend +) + + +def is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-m", + "--model_name", + required=True, + help=f"Provide model name. Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", + ) + parser.add_argument( + "--output_dir", + type=pathlib.Path, + default=pathlib.Path("./"), + help="Output directory for the exported model", + ) + parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) + parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) + + args = parser.parse_args() + return args + + +def save_processed_bytes(processed_bytes, base_name: str): + filename = f"{base_name}.bin" + print(f"Saving processed bytes to {filename}") + with open(filename, "wb") as file: + file.write(processed_bytes) + return + + +def main(): + args = parse_args() + + if args.model_name not in MODEL_NAME_TO_MODEL: + raise RuntimeError( + f"Model {args.model_name} is not a valid name. " + f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}." + ) + + ( + model, + example_args, + example_kwargs, + dynamic_shapes, + ) = EagerModelFactory.create_model(*MODEL_NAME_TO_MODEL[args.model_name]) + model = model.eval() + exported_programs = torch.export.export( + model, + args=example_args, + kwargs=example_kwargs, + dynamic_shapes=dynamic_shapes, + ) + print(exported_programs) + + partitioner = CudaPartitioner( + [CudaBackend.generate_method_name_compile_spec(args.model_name)] + ) + # Add decompositions for triton to generate kernels. 
+ exported_programs = exported_programs.run_decompositions( + { + torch.ops.aten.conv1d.default: conv1d_to_conv2d, + } + ) + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]): + et_prog = to_edge_transform_and_lower( + exported_programs, + partitioner=[partitioner], + compile_config=_EDGE_COMPILE_CONFIG, + generate_etrecord=args.generate_etrecord, + ) + exec_program = et_prog.to_executorch() + save_pte_program(exec_program, args.model_name, args.output_dir) + if args.generate_etrecord: + exec_program.get_etrecord().save(f"{args.model_name}_etrecord.bin") + + +if __name__ == "__main__": + main() diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index cfe691c7bd4..6df4caf8692 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -8,7 +8,7 @@ set -x conda install -c conda-forge "ffmpeg<8" -y -pip install torchcodec==0.7.0.dev20250906 --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip install torchcodec==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu pip install moshi==0.2.4 pip install bitsandbytes soundfile # Run llama2/install requirements for torchao deps diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py index be3c075913d..d0c3c2ceb15 100644 --- a/examples/models/moshi/mimi/test_mimi.py +++ b/examples/models/moshi/mimi/test_mimi.py @@ -156,7 +156,7 @@ def test_streaming_encoding_decoding(self): all_pcms_streaming = torch.cat(all_pcms_streaming, dim=-1) sqnr_streaming = compute_sqnr(pcm_ref, all_pcms_streaming) print(f"sqnr_streaming = {sqnr_streaming} dB") - self.assertTrue(sqnr_streaming > 100) + self.assertTrue(sqnr_streaming > 70) def test_exported_encoding(self): """Ensure exported encoding model is consistent with reference output.""" diff --git a/src/executorch/examples/cuda b/src/executorch/examples/cuda new file mode 120000 index 00000000000..aa2e50dd2cc --- /dev/null +++ b/src/executorch/examples/cuda @@ -0,0 +1 @@ +../../../examples/cuda \ No newline at end of file diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 0039ab551fb..bf5eaaef107 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -145,6 +145,9 @@ define_overridable_option( define_overridable_option( EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF +) define_overridable_option( EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF ) @@ -342,6 +345,10 @@ check_required_options_on( EXECUTORCH_BUILD_EXTENSION_LLM ) +check_required_options_on( + IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR +) + if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH}) message( FATAL_ERROR diff --git a/torch_pin.py b/torch_pin.py index 1b89309ad05..02040c91963 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ TORCH_VERSION = "2.10.0" -NIGHTLY_VERSION = "dev20250915" +NIGHTLY_VERSION = "dev20251003"
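A few usage notes on the patch above. The new `test-models-cuda` job in `.github/workflows/cuda.yml` drives everything through the `cuda` branch added to `.ci/scripts/test_model.sh`, so a single matrix entry reduces to three commands. A minimal sketch of reproducing one entry by hand on a CUDA machine (using `resnet18`; the matrix also covers `linear`, `add`, and `add_mul`):

```bash
# Install ExecuTorch with the CUDA backend enabled, as the workflow script does.
PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh

# The workflow prepends conda's lib dir so the runner can locate the Torch/CUDA shared libraries.
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

# Same entry point the matrix uses: <model> <build-tool> <backend>.
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh resnet18 cmake cuda
```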
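Outside CI, `test_model_with_cuda` expands to an export, a CUDA-enabled `executor_runner` build, and a run that feeds the AOTI blob back in through `--data_path`. A hand-expanded sketch, assuming `cmake-out` as the build directory; the script's CUDA branch also passes whatever common flags `${COMMON}` carries, so a standalone build may need a few extra options:

```bash
# 1) Export: produces <model>.pte plus the aoti_cuda_blob.ptd named-data file the runner expects.
python -m examples.cuda.scripts.export --model_name=add_mul --output_dir ./

# 2) Build the runner with the new CUDA branch of build_cmake_executor_runner().
cmake -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_BUILD_CUDA=ON \
      -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
      -Bcmake-out .
cmake --build cmake-out -j4

# 3) Run: at init() the CudaBackend writes the blob to a temporary .so and dlopen()s it;
#    at execute() it stages the CPU inputs/outputs through GPU copies.
./cmake-out/executor_runner --model_path ./add_mul.pte --data_path ./aoti_cuda_blob.ptd
```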
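On the build-system side, `EXECUTORCH_BUILD_CUDA` is a new top-level option that pulls `backends/aoti` (`aoti_common`) and `backends/cuda` (`aoti_cuda`) into the build, and `tools/cmake/preset/default.cmake` now ties it to `EXECUTORCH_BUILD_EXTENSION_TENSOR` via `check_required_options_on`. Assuming that guard behaves like the existing ones and rejects an invalid combination at configure time, the contract looks like:

```bash
# Expected to fail configuration: the tensor extension is required whenever CUDA is on.
cmake -DEXECUTORCH_BUILD_CUDA=ON -Bcmake-out .

# Valid: configures aoti_common + aoti_cuda, resolving Torch via find_package_torch()
# and the CUDA runtime via find_package(CUDAToolkit).
cmake -DEXECUTORCH_BUILD_CUDA=ON -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON -Bcmake-out .
```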
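The example exporter also accepts optional switches beyond the required `--model_name`; for instance, `--generate_etrecord` additionally writes `<model>_etrecord.bin` in the working directory:

```bash
# Exports resnet18.pte and, with the flag, resnet18_etrecord.bin as well.
python -m examples.cuda.scripts.export \
    --model_name resnet18 \
    --output_dir ./ \
    --generate_etrecord
```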