From 97adfcb1afd75d0f5bd1cddd316fa85140aeb301 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 3 Oct 2025 11:05:54 -0700 Subject: [PATCH 1/3] replace 90a-virtual by 90-virtual for forward compatible --- .../llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl | 3 ++- onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc | 2 ++ .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 4 ++-- .../github/azure-pipelines/stages/py-gpu-packaging-stage.yml | 2 +- tools/ci_build/github/linux/build_cuda_c_api_package.sh | 2 +- tools/ci_build/github/linux/build_linux_python_package.sh | 2 +- tools/ci_build/github/linux/build_nodejs_package.sh | 2 +- tools/ci_build/github/linux/build_tensorrt_c_api_package.sh | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 4f901a550e8bf..588f37051b534 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -60,7 +60,7 @@ namespace cutlass_kernels { template -#ifdef COMPILE_HOPPER_TMA_GEMMS +#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__) void sm90_generic_mixed_gemm_kernelLauncher( ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases, @@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher( } } #else // COMPILE_HOPPER_TMA_GEMMS +// This stub is now used for ALL non-SASS or non-SM90A compilation passes, including the 90-virtual (PTX) pass. 
void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*, ScaleZeroType const*, ScaleZeroType const*, BiasType const*, float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig, diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc index 925a6913a2890..e5b15856a6c05 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#if USE_FPA_INTB_GEMM #include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h" #include "contrib_ops/cuda/llm/common/workspace.h" @@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i } } // namespace onnxruntime::llm::kernels::weight_only +#endif diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 61afeba2d302b..e7e541205ba0a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -60,7 +60,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -80,7 +80,7 @@ stages: msbuildPlatform: x64 
CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index d8bb51b5ef79d..d3d4b8f5b64d5 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -38,7 +38,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90a-virtual" + EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual" use_tensorrt: True - template: py-linux-gpu-stage.yml diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 9cc140f41cf91..2f3ac991aee9c 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -2,4 +2,4 @@ set -e -x docker run -e SYSTEM_COLLECTIONURI --rm --volume \ 
$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 65be0c7b60ead..62bf8b4a245bb 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -69,7 +69,7 @@ fi if [ "$BUILD_DEVICE" == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') #Enable CUDA and TRT EPs. 
- BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90a-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") fi if [ "$BUILD_DEVICE" == "NPU" ]; then diff --git a/tools/ci_build/github/linux/build_nodejs_package.sh b/tools/ci_build/github/linux/build_nodejs_package.sh index cc6443cc7fab6..ff5c504376d1d 100755 --- a/tools/ci_build/github/linux/build_nodejs_package.sh +++ b/tools/ci_build/github/linux/build_nodejs_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install 
DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index b8d968c82d002..c0849bf0ace73 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config 
Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" From 4675385f9a4edcdc5e3f265090095aeb8c59ff63 Mon Sep 17 00:00:00 2001 From: Tianlei WU Date: Mon, 6 Oct 2025 10:45:10 -0700 Subject: [PATCH 2/3] Fix test --- .../test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc index 3e339d86c7943..1652d16f5cb66 100644 --- a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc +++ b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc @@ -3,7 +3,7 @@ // Test can be run like the following: // ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.* - +#if USE_FPA_INTB_GEMM #include #include #include @@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) { } } } +#endif From 8e644e7c2e1dfbeda09020792be1be3f53bad60b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 6 Oct 2025 20:30:24 -0700 Subject: [PATCH 3/3] disable FPA INTB GEMM --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 793207f5b6d76..116d369885a27 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF) cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product 
attention" ON "onnxruntime_USE_CUDA" OFF) option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF) cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) -cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF) +option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF) option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF) option(onnxruntime_USE_AVX "Use AVX instructions" OFF)