
Commit 8dd32fe

[TensorRT EP] avoid excessive library load/unload overhead when running unit tests. (#15639)
TensorRT loads and unloads libraries as builder objects are created and torn down. This happens for every single unit test, so the overhead accumulates into excessive test execution time. It has also grown steadily over the past few TensorRT versions as the library objects get bigger, to the point that running all the unit tests takes about 8 hours. Nvidia suggests keeping a placeholder builder object alive to avoid the repeated load/unload.
1 parent: c2acf69
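For context, the per-test pattern that triggers the overhead looks roughly like the sketch below (illustrative, not code from the repository; the actual fix is the placeholder builder added to test_main.cc in this diff):

// Illustrative sketch of the overhead being avoided; run_one_trt_test and
// its body are hypothetical, not repository code.
#include <memory>
#include "NvInfer.h"

void run_one_trt_test(nvinfer1::ILogger& logger) {
  // Creating the first live builder loads the TensorRT kernel libraries.
  auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  // ... build and run an engine for this one test ...
}  // builder destroyed here; with no other builder alive, the libraries
   // unload, and the next test pays the load cost all over again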

File tree

8 files changed: +36 −37 lines


cmake/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -89,7 +89,6 @@ option(onnxruntime_ENABLE_MICROSOFT_INTERNAL "Use this option to enable/disable
 option(onnxruntime_USE_VITISAI "Build with Vitis-AI" OFF)
 option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF)
 option(onnxruntime_USE_TENSORRT_BUILTIN_PARSER "Use TensorRT builtin parser" OFF)
-option(onnxruntime_TENSORRT_PLACEHOLDER_BUILDER "Instantiate Placeholder TensorRT Builder" OFF)
 option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
 option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
 option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)

cmake/onnxruntime_unittests.cmake

Lines changed: 18 additions & 15 deletions
@@ -69,6 +69,10 @@ function(AddTest)
       target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS})
     endif()
   endif()
+  if (onnxruntime_USE_TENSORRT)
+    # used for instantiating placeholder TRT builder to mitigate TRT library load/unload overhead
+    target_include_directories(${_UT_TARGET} PRIVATE ${TENSORRT_INCLUDE_DIR})
+  endif()

   if(MSVC)
     target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
@@ -583,6 +587,7 @@ if(onnxruntime_USE_TENSORRT)
   list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/tensorrt_execution_provider_utils.h")
   list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_tensorrt)
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_tensorrt onnxruntime_providers_shared)
+  list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER})
 endif()

 if(onnxruntime_USE_MIGRAPHX)
@@ -770,21 +775,13 @@ endif()

 set(test_all_args)
 if (onnxruntime_USE_TENSORRT)
-  if (onnxruntime_SKIP_AND_PERFORM_FILTERED_TENSORRT_TESTS)
-    # TRT EP package pipelines takes much longer time to run tests with TRT 8.5. We can't use placeholder to reduce testing time due to application test deadlock.
-    # Therefore we only run filtered TRT EP tests.
-    list(APPEND test_all_args "--gtest_filter=*tensorrt_*:*TensorrtExecutionProviderTest*" )
-    #list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*:*ContribOpTest*:*QuantGemmTest*:*QLinearConvTest*:*MurmurHash3OpTest*:*PadOpTest*:*QLinearConvTest*" )
-  else()
-    # TRT EP CI takes much longer time when updating to TRT 8.2
-    # So, we only run trt ep and exclude other eps to reduce CI test time.
-    #
-    # The test names of model tests were using sequential number in the past.
-    # This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
-    # made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu_* or *xxx_*.
-    list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*" )
-  endif()
-
+  # TRT EP CI takes much longer time when updating to TRT 8.2
+  # So, we only run trt ep and exclude other eps to reduce CI test time.
+  #
+  # The test names of model tests were using sequential number in the past.
+  # This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
+  # made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu_* or *xxx_*.
+  list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*" )
 endif ()

 AddTest(
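An aside on the googletest filter retained above: a leading '-' turns the list into exclusion patterns and ':' separates patterns, so -*cpu_*:*cuda_* runs all tests except those whose names contain cpu_ or cuda_. A minimal sketch of the programmatic equivalent (illustrative only; onnxruntime_test_all receives the flag on its command line via test_all_args):

// Minimal sketch: setting the same filter in code instead of via
// --gtest_filter on the command line. Illustrative only; not part of
// this commit.
#include <gtest/gtest.h>

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // '-' prefixes the exclusion list; ':' separates patterns. This keeps
  // every test except those whose full names match *cpu_* or *cuda_*.
  ::testing::GTEST_FLAG(filter) = "-*cpu_*:*cuda_*";
  return RUN_ALL_TESTS();
}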
@@ -1202,6 +1199,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     if (onnxruntime_USE_CUDA)
       list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_test_cuda_ops_lib cudart)
     endif()
+    if (onnxruntime_USE_TENSORRT)
+      list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
+    endif()
     if (CMAKE_SYSTEM_NAME STREQUAL "Android")
       list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs})
     endif()
@@ -1465,6 +1465,9 @@ if (NOT onnxruntime_BUILD_WEBASSEMBLY)
     ${ONNXRUNTIME_CUSTOM_OP_REGISTRATION_TEST_SRC_DIR}/test_registercustomops.cc)

   set(onnxruntime_customopregistration_test_LIBS custom_op_library onnxruntime_common onnxruntime_test_utils)
+  if (onnxruntime_USE_TENSORRT)
+    list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER})
+  endif()
   AddTest(DYN
     TARGET onnxruntime_customopregistration_test
     SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src}

onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 0 additions & 6 deletions
@@ -363,12 +363,6 @@ std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
   return std::unique_lock<OrtMutex>(singleton);
 }

-#ifdef ORT_TENSORRT_PLACEHOLDER_BUILDER
-// instantiate global unused builder object which keeps the TRT kernel library in memory
-// so that subsequent builders avoid the expensive load / unload process.
-auto const placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(GetTensorrtLogger()));
-#endif
-
 TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, info_(info), device_id_(info.device_id) {
   InitProviderOrtApi();

onnxruntime/test/unittest_main/test_main.cc

Lines changed: 14 additions & 0 deletions
@@ -25,6 +25,20 @@ void ortenv_setup() {
   ort_env.reset(new Ort::Env(&tpo, ORT_LOGGING_LEVEL_WARNING, "Default"));
 }

+#ifdef USE_TENSORRT
+// TensorRT will load/unload libraries as builder objects are created and torn down. This will happen for
+// every single unit test, which leads to excessive test execution time due to that overhead.
+// Nvidia suggests to keep a placeholder builder object around to avoid this.
+#include "NvInfer.h"
+class DummyLogger : public nvinfer1::ILogger {
+ public:
+  DummyLogger(Severity verbosity) {}
+  void log(Severity severity, const char* msg) noexcept override {}
+};
+DummyLogger trt_logger(nvinfer1::ILogger::Severity::kWARNING);
+auto const placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(trt_logger));
+#endif
+
 #define TEST_MAIN main

 #if defined(__APPLE__)
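A quick way to see the effect of this resident placeholder (a hypothetical micro-benchmark, not part of this commit): while any builder stays alive, constructing further builders skips the library load that dominates the first construction.

// Hypothetical micro-benchmark, assuming a TensorRT dev environment; not
// part of this commit. The second builder is created while the first is
// still alive, so it should not pay the library load/unload cost again.
#include <chrono>
#include <cstdio>
#include <memory>
#include "NvInfer.h"

namespace {
class SilentLogger : public nvinfer1::ILogger {
  void log(Severity, const char*) noexcept override {}
} logger;
}  // namespace

int main() {
  using Clock = std::chrono::steady_clock;
  auto ms = [](Clock::duration d) {
    return std::chrono::duration_cast<std::chrono::milliseconds>(d).count();
  };

  auto t0 = Clock::now();
  // First builder pays the one-time library load cost and is kept alive.
  auto placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  auto t1 = Clock::now();
  // Second builder is created while the first is resident.
  auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  auto t2 = Clock::now();

  std::printf("first builder: %lld ms, second builder: %lld ms\n",
              static_cast<long long>(ms(t1 - t0)), static_cast<long long>(ms(t2 - t1)));
  return 0;
}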

tools/ci_build/build.py

Lines changed: 0 additions & 11 deletions
@@ -502,9 +502,6 @@ def convert_arg_line_to_args(self, arg_line):
         "--use_tensorrt_builtin_parser", action="store_true", default=True, help="Use TensorRT builtin parser"
     )
     parser.add_argument("--use_tensorrt_oss_parser", action="store_true", help="Use TensorRT OSS parser")
-    parser.add_argument(
-        "--tensorrt_placeholder_builder", action="store_true", help="Instantiate Placeholder TensorRT Builder"
-    )
     parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir")
     parser.add_argument("--test_all_timeout", default="10800", help="Set timeout for onnxruntime_test_all")
     parser.add_argument("--use_migraphx", action="store_true", help="Build with MIGraphX")
@@ -911,11 +908,8 @@ def generate_build_tree(
         "-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"),
         "-Donnxruntime_USE_VITISAI=" + ("ON" if args.use_vitisai else "OFF"),
         "-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
-        "-Donnxruntime_SKIP_AND_PERFORM_FILTERED_TENSORRT_TESTS="
-        + ("ON" if not args.tensorrt_placeholder_builder else "OFF"),
         "-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
         + ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
-        "-Donnxruntime_TENSORRT_PLACEHOLDER_BUILDER=" + ("ON" if args.tensorrt_placeholder_builder else "OFF"),
         # set vars for TVM
         "-Donnxruntime_USE_TVM=" + ("ON" if args.use_tvm else "OFF"),
         "-Donnxruntime_TVM_CUDA_RUNTIME=" + ("ON" if args.use_tvm and args.tvm_cuda_runtime else "OFF"),
@@ -1749,11 +1743,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
             run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path)

         if args.enable_pybind:
-            # Disable python tests for TensorRT on Windows due to need to enable placeholder builder
-            # to reduce test times.
-            if args.use_tensorrt and is_windows():
-                return
-
             python_path = None
             if args.use_tvm:
                 python_path = str((Path(build_dir) / config / "_deps" / "tvm-src" / "python").resolve())

tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ jobs:
               --build_wheel \
               --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
               --enable_pybind --build_java \
-              --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home /usr \
+              --use_tensorrt --tensorrt_home /usr \
               --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=75
           workingDirectory: $(Build.SourcesDirectory)

tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ stages:
           docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \
             --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \
             /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \
-            --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80'
+            --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} --use_tensorrt --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80'
         workingDirectory: $(Build.SourcesDirectory)

 - ${{ if eq(parameters.buildJava, true) }}:

tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml

Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ jobs:
     displayName: 'Generate cmake config'
     inputs:
       scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75'
+      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75'
     workingDirectory: '$(Build.BinariesDirectory)'

 - task: VSBuild@1

@@ -84,7 +84,7 @@ jobs:
       del wheel_filename_file
       python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
       set PATH=$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig);%PATH%
-      python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
+      python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75

     workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
     displayName: 'Run tests'
