Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF)

option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
option(onnxruntime_USE_AVX "Use AVX instructions" OFF)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ namespace cutlass_kernels {
template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType, typename OutputType,
cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag, typename CTAShape, typename ClusterShape,
typename MainloopScheduleType, typename EpilogueScheduleType>
#ifdef COMPILE_HOPPER_TMA_GEMMS
#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__)
void sm90_generic_mixed_gemm_kernelLauncher(
ActivationType const* A, WeightType const* B,
ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases,
Expand Down Expand Up @@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(
}
}
#else // COMPILE_HOPPER_TMA_GEMMS
// This stub is now used for ALL non-SASS or non-SM90A compilation passes includes the 90-virtual (PTX) pass.
void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*,
ScaleZeroType const*, ScaleZeroType const*, BiasType const*,
float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig,
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if USE_FPA_INTB_GEMM
#include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h"
#include "contrib_ops/cuda/llm/common/workspace.h"

Expand Down Expand Up @@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i
}

} // namespace onnxruntime::llm::kernels::weight_only
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

// Test can be run like the following:
// ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.*

#if USE_FPA_INTB_GEMM
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
Expand Down Expand Up @@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) {
}
}
}
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ stages:
msbuildPlatform: x64
packageName: x64-cuda
CudaVersion: ${{ parameters.CudaVersion }}
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual"
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual"
runTests: ${{ parameters.RunOnnxRuntimeTests }}
buildJava: ${{ parameters.buildJava }}
java_artifact_id: onnxruntime_gpu
Expand All @@ -80,7 +80,7 @@ stages:
msbuildPlatform: x64
CudaVersion: ${{ parameters.CudaVersion }}
packageName: x64-tensorrt
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual"
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual"
runTests: ${{ parameters.RunOnnxRuntimeTests }}
buildJava: ${{ parameters.buildJava }}
java_artifact_id: onnxruntime_gpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ stages:
PYTHON_VERSION: ${{ python_version }}
EP_NAME: gpu
CudaVersion: ${{ parameters.cuda_version }}
EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90a-virtual"
EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual"
use_tensorrt: True

- template: py-linux-gpu-stage.yml
Expand Down
2 changes: 1 addition & 1 deletion tools/ci_build/github/linux/build_cuda_c_api_package.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
set -e -x
docker run -e SYSTEM_COLLECTIONURI --rm --volume \
$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:115: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:51: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:1: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:115: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:51: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_cuda_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_cuda_c_api_package.sh:4:1: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed"
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed"
2 changes: 1 addition & 1 deletion tools/ci_build/github/linux/build_linux_python_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ fi
if [ "$BUILD_DEVICE" == "GPU" ]; then
SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/')
#Enable CUDA and TRT EPs.
BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90a-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
fi

if [ "$BUILD_DEVICE" == "NPU" ]; then
Expand Down
2 changes: 1 addition & 1 deletion tools/ci_build/github/linux/build_nodejs_package.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
set -e -x
mkdir -p $HOME/.onnx

Check warning on line 3 in tools/ci_build/github/linux/build_nodejs_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_nodejs_package.sh:3:10: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 3 in tools/ci_build/github/linux/build_nodejs_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_nodejs_package.sh:3:10: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \

Check warning on line 4 in tools/ci_build/github/linux/build_nodejs_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_nodejs_package.sh:4:84: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_nodejs_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_nodejs_package.sh:4:84: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed"
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed"
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
set -e -x
mkdir -p $HOME/.onnx

Check warning on line 3 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:3:10: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 3 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:3:10: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \

Check warning on line 4 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:4:134: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:4:84: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:4:134: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 4 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:4:84: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \

Check warning on line 5 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:5:120: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 5 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:5:49: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 5 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:5:120: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)

Check warning on line 5 in tools/ci_build/github/linux/build_tensorrt_c_api_package.sh

View workflow job for this annotation

GitHub Actions / Optional Lint

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. Raw Output: ./tools/ci_build/github/linux/build_tensorrt_c_api_package.sh:5:49: info: Double quote to prevent globbing and word splitting. (ShellCheck.SC2086)
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed"
/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed"
Loading