Commit 0c0e20d

Plugin TensorRT EP using ORT EP ABI (#527)
* plugin TRT EP init
* clean up GetCapabilityImpl and make it pass compiler for now
* Clean up CompileImpl
* update ep factory
* update ep factory
* update ep factory
* clean up and add back onnx_ctx_model_helper.cc
* clean up
* remove onnxruntime namespace
* update
* Add TRTEpNodeComputeInfo
* add allocator and data transfer
* fix a lot of compile errors
* call EpDevice_AddAllocatorInfo in GetSupportedDevicesImpl
* temporary way to get provider option without proper API
* Clean up cmake file to remove dependencies that are built with ORT
* Update CompileImpl
* add ort_graph_to_proto.h and leverage OrtGraphToProto utilities
* update EP context model helper
* Convert onnxruntime::Status to OrtStatus
* remove unused files
* use GetSessionOptionsConfigEntries to get provider options
* fix a bunch of compile errors
* update memory info and data transfer in TRT EP's factory to accommodate multiple GPU devices
* update cuda/pinned allocator to make compiler happy
* add GetVersionImpl in factory
* update data transfer initialization in TRT EP
* Fix compile errors/issues
* fix to use correct API
* fix bug in GPU data transfer implementation
* clean up
* remove unnecessary files
* Temporarily create a cudaStream manually to run
* Temporarily make plugin TRT link against the protobuf, onnx, flatbuffers built from the ORT repo
* fix error LNK2038: mismatch detected for 'RuntimeLibrary' in the CMake Windows debug build
* refactor memory info stored in factory
* update as onnxruntime_ep_c_api.h changes
* Add support for dumping and running an EP Context model
* update and sync with latest ep c api
* remove delete resource in TRTEpDataTransfer::ReleaseImpl
* update cmake file to force dynamic release CRT globally for all dependencies if it's a release build
* use updated Value_GetMemoryDevice API
* update ort to graph util
* Add EP API Stream support
* Update CMakeLists.txt
* fix mem leak for OrtAllocator
* add missing header file
* fix build issue on Linux
* lintrunner -a
* Update to use new API OpAttr_GetTensorAttributeAsOrtValue
* remove unnecessary files
* Add default logger for TRT logger
* Add default logger for TRT EP
* update include path in utility function header
* Add default logger for TRT EP (cont.)
* put code under namespace trt_ep
* remove unnecessary files
* update GetCapabilityImpl()
* Add code for updating cache path for EPContext node
* add onnx_external_data_bytestream support for refitting the engine
* address reviewer's comments
* Add try/catch for C++ API calls that throw Ort::Exception
* Set node_fusion_options.drop_constant_initializers to true
* remove unused code
* add missing trt_ep namespace
* remove the remaining commented code
* address reviewer's comments
* address reviewer's comments
1 parent f271c1e commit 0c0e20d

33 files changed, +8956 −0 lines
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@

# usage:
# cd build/
# cmake -S ../ -B ./ -DCMAKE_BUILD_TYPE=Debug -DORT_HOME=/path/to/ort_package/onnxruntime-win-x64-gpu-1.23.0 -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DTENSORRT_HOME=/path/to/tensorrt/TensorRT-10.3.0.26 -DCMAKE_POSITION_INDEPENDENT_CODE=ON (see the result of "nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits")
# cmake --build ./ --config Debug
cmake_minimum_required(VERSION 3.26)
project(TensorRTEp VERSION 1.0)
set(CMAKE_CXX_STANDARD 17)

enable_language(CUDA) # via nvcc to get the CUDA toolkit
file(TO_CMAKE_PATH "/usr/local/cuda" CUDAToolkit_ROOT)
find_package(CUDAToolkit REQUIRED)

# CMake config to force dynamic debug CRT or dynamic release CRT globally for all dependencies.
# This is to address the issue of:
# libprotobufd.lib(common.obj) : error LNK2038: mismatch detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value 'MDd_DynamicDebug' in unary_elementwise_ops_impl.obj
if (WIN32)
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "" FORCE) # /MDd
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()

  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "" FORCE)
    set(BUILD_SHARED_LIBS OFF) # Build protobuf as static .lib, but using dynamic runtime
  endif()
endif()

add_definitions(-DONNX_NAMESPACE=onnx)
add_definitions(-DONNX_ML)
add_definitions(-DNOMINMAX)
file(GLOB tensorrt_src "./*.cc" "./utils/*.cc" "./cuda/unary_elementwise_ops_impl.cu" "./*.h")
add_library(TensorRTEp SHARED ${tensorrt_src})

if (NOT ORT_HOME)
  message(FATAL_ERROR "Please specify ORT_HOME, e.g. -DORT_HOME=/path/to/ort/")
endif()

if (NOT TENSORRT_HOME)
  message(FATAL_ERROR "Please specify TENSORRT_HOME, e.g. -DTENSORRT_HOME=/path/to/trt/")
endif()

# Use release mode if not specified
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

# Add dependencies
include(FetchContent)

# Add protobuf
FetchContent_Declare(
  protobuf
  GIT_REPOSITORY https://github.com/protocolbuffers/protobuf.git
  GIT_TAG v21.12 # Use a specific tag or commit
)

if (WIN32)
  # Sometimes, protobuf ignores CMAKE_MSVC_RUNTIME_LIBRARY. To ensure it works:
  set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "" FORCE)
endif()

FetchContent_MakeAvailable(protobuf)

# Add ONNX
FetchContent_Declare(
  onnx
  GIT_REPOSITORY https://github.com/onnx/onnx.git
  GIT_TAG v1.18.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(onnx)

# Add GSL
FetchContent_Declare(
  gsl
  GIT_REPOSITORY https://github.com/microsoft/GSL.git
  GIT_TAG v4.0.0 # Use a specific tag or commit
)

FetchContent_MakeAvailable(gsl)

# Add flatbuffers
FetchContent_Declare(
  flatbuffers
  GIT_REPOSITORY https://github.com/google/flatbuffers.git
  GIT_TAG v23.5.26 # Use a specific tag or commit
)

FetchContent_MakeAvailable(flatbuffers)

set(DEPS_PATH "${CMAKE_BINARY_DIR}/_deps")

if (WIN32) # Windows
  set(ORT_LIB "${ORT_HOME}/lib/onnxruntime.lib")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/nvinfer_10.lib"
               "${TENSORRT_HOME}/lib/nvinfer_plugin_10.lib"
               "${TENSORRT_HOME}/lib/nvonnxparser_10.lib")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobufd.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotocd.lib")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobuf.lib"
        "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotoc.lib")
  endif()

  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/${CMAKE_BUILD_TYPE}/flatbuffers.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx.lib"
      "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx_proto.lib")

  set(TRT_EP_LIB_LINK_FLAG
      "-DEF:${CMAKE_SOURCE_DIR}/tensorrt_execution_provider.def")

else() # Linux
  set(ORT_LIB "${ORT_HOME}/lib/libonnxruntime.so")
  set(TRT_LIBS "${TENSORRT_HOME}/lib/libnvinfer.so"
               "${TENSORRT_HOME}/lib/libnvinfer_plugin.so"
               "${TENSORRT_HOME}/lib/libnvonnxparser.so")
  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/libflatbuffers.a"
      "${DEPS_PATH}/onnx-build/libonnx.a"
      "${DEPS_PATH}/onnx-build/libonnx_proto.a")

  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobufd.a"
        "${DEPS_PATH}/protobuf-build/libprotocd.a")
  else()
    set(DEPS_LIBS ${DEPS_LIBS}
        "${DEPS_PATH}/protobuf-build/libprotobuf.a"
        "${DEPS_PATH}/protobuf-build/libprotoc.a")
  endif()
endif()

MESSAGE(STATUS "Looking for following dependencies ...")
MESSAGE(STATUS "ORT lib  : ${ORT_LIB}")
MESSAGE(STATUS "TRT libs : ${TRT_LIBS}")
MESSAGE(STATUS "Deps libs: ${DEPS_LIBS}")

set_property(TARGET TensorRTEp APPEND_STRING PROPERTY LINK_FLAGS
             ${TRT_EP_LIB_LINK_FLAG})

target_include_directories(TensorRTEp PUBLIC "${ORT_HOME}/include"
                           "./utils"
                           "/usr/local/cuda/include"
                           "${TENSORRT_HOME}/include"
                           "${DEPS_PATH}/flatbuffers-src/include"
                           "${DEPS_PATH}/gsl-src/include" # GSL is header-only
                           "${DEPS_PATH}/onnx-src"
                           "${DEPS_PATH}/onnx-build"
                           "${DEPS_PATH}/protobuf-src/src"
)

target_link_libraries(TensorRTEp PUBLIC #${DEPS_LIBS}
                      protobuf::libprotobuf onnx flatbuffers
                      ${ORT_LIB}
                      ${TRT_LIBS}
                      CUDA::cudart
)
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include <stdint.h>

namespace cuda {

// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer.
// For now, use int32_t to ensure that both Linux and Windows see this as a 32-bit integer type.
#ifndef CUDA_LONG
#define CUDA_LONG int32_t
#endif

template <class INT, class INT2>
inline __host__ __device__ INT CeilDiv(INT a, INT2 b)  // ceil(a/b)
{
  return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);  // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}

struct GridDim {
  enum : CUDA_LONG {
    maxThreadsPerBlock = 256,   // max threads per block
    maxElementsPerThread = 4,   // max elements processed per thread
  };
};

template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _UnaryElementWise(
    const InT* input_data,
    OutT* output_data,
    const FuncT functor,
    CUDA_LONG N) {
  CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
  InT value[NumElementsPerThread];

  CUDA_LONG id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      value[i] = input_data[id];
      id += NumThreadsPerBlock;
    }
  }

  id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output_data[id] = functor(value[i]);
      id += NumThreadsPerBlock;
    }
  }
}

template <typename InT, typename OutT, typename FuncT>
void UnaryElementWiseImpl(
    cudaStream_t stream,
    const InT* input_data,
    OutT* output_data,
    const FuncT& func,
    size_t count) {
  if (count == 0)  // special case where there's a dim value of 0 in the shape
    return;

  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  CUDA_LONG N = static_cast<CUDA_LONG>(count);
  _UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_data,
          output_data,
          func,
          N);
}

}  // namespace cuda
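
For context, here is a minimal sketch of how a caller might launch the UnaryElementWiseImpl helper above with a custom functor. The Square functor, SquareAll wrapper, and buffer names are hypothetical illustrations, not part of this commit; the file would need to be compiled as CUDA (.cu) since it instantiates the kernel.

// Hypothetical usage sketch: apply x*x element-wise via the launcher above.
#include <cuda_runtime.h>
#include "cu_inc/unary_elementwise_impl.cuh"  // include path as used elsewhere in this commit

struct Square {
  __device__ __inline__ float operator()(const float& x) const { return x * x; }
};

void SquareAll(cudaStream_t stream, const float* in, float* out, size_t count) {
  // Grid size is computed inside UnaryElementWiseImpl from GridDim::maxThreadsPerBlock (256)
  // and GridDim::maxElementsPerThread (4), i.e. 1024 elements per block.
  cuda::UnaryElementWiseImpl(stream, in, out, Square(), count);
}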
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cuda_runtime.h>
#include "cu_inc/unary_elementwise_impl.cuh"

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
#include "cuda_fp8.h"
#endif
#include <cuda_fp16.h>

namespace cuda {

// the postfix means the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
// X: BFloat16

// When casting, half needs to be converted via float type from most other types
template <typename T>
struct ViaTypeMap {
  typedef T ViaT;
};

template <>
struct ViaTypeMap<half> {
  typedef float ViaT;
};

template <typename InT, typename OutT>
struct OP_Cast {
  __device__ __inline__ OutT operator()(const InT& a) const {
    const bool any_float16 = std::is_same<half, InT>::value || std::is_same<half, OutT>::value;
    typedef typename std::conditional<any_float16, half, OutT>::type T;
    typedef typename ViaTypeMap<T>::ViaT ViaT;
    return (OutT)((ViaT)a);
  }
};

#define IMPL_CAST_IMPL(InT, OutT)                                                                         \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \
    UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<InT, OutT>(), count);                  \
  }

#define IMPL_CAST_IMPL_THROW(InT, OutT)                                                               \
  void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \
                          size_t /*count*/) {                                                         \
    ORT_THROW("Cast from " #InT " to " #OutT " must define saturate.");                               \
  }

#define IMPL_CAST_IMPL_FROM(T) \
  IMPL_CAST_IMPL(T, half)      \
  IMPL_CAST_IMPL(T, float)     \
  IMPL_CAST_IMPL(T, double)    \
  IMPL_CAST_IMPL(T, int8_t)    \
  IMPL_CAST_IMPL(T, int16_t)   \
  IMPL_CAST_IMPL(T, int32_t)   \
  IMPL_CAST_IMPL(T, int64_t)   \
  IMPL_CAST_IMPL(T, uint8_t)   \
  IMPL_CAST_IMPL(T, uint16_t)  \
  IMPL_CAST_IMPL(T, uint32_t)  \
  IMPL_CAST_IMPL(T, uint64_t)  \
  IMPL_CAST_IMPL(T, bool)      \
// IMPL_CAST_IMPL(T, BFloat16)

IMPL_CAST_IMPL_FROM(half)
IMPL_CAST_IMPL_FROM(float)
IMPL_CAST_IMPL_FROM(double)
IMPL_CAST_IMPL_FROM(int8_t)
IMPL_CAST_IMPL_FROM(int16_t)
IMPL_CAST_IMPL_FROM(int32_t)
IMPL_CAST_IMPL_FROM(int64_t)
IMPL_CAST_IMPL_FROM(uint8_t)
IMPL_CAST_IMPL_FROM(uint16_t)
IMPL_CAST_IMPL_FROM(uint32_t)
IMPL_CAST_IMPL_FROM(uint64_t)
IMPL_CAST_IMPL_FROM(bool)
// IMPL_CAST_IMPL_FROM(BFloat16)

}  // namespace cuda
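
Each IMPL_CAST_IMPL_FROM(T) line above stamps out one Explicit_Impl_Cast overload per destination type. As an illustration, the float-to-half overload produced by IMPL_CAST_IMPL(float, half) expands to the equivalent of:

// Expansion of IMPL_CAST_IMPL(float, half), shown for clarity only.
void Explicit_Impl_Cast(cudaStream_t stream, const float* input_data, half* output_data, size_t count) {
  UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<float, half>(), count);
}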
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

namespace cuda {

// Cast

#define DECL_IMPL_CAST(InT, OutT) \
  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count);

#define DECL_IMPL_CAST_FROM(T) \
  DECL_IMPL_CAST(T, half)      \
  DECL_IMPL_CAST(T, float)     \
  DECL_IMPL_CAST(T, double)    \
  DECL_IMPL_CAST(T, int8_t)    \
  DECL_IMPL_CAST(T, int16_t)   \
  DECL_IMPL_CAST(T, int32_t)   \
  DECL_IMPL_CAST(T, int64_t)   \
  DECL_IMPL_CAST(T, uint8_t)   \
  DECL_IMPL_CAST(T, uint16_t)  \
  DECL_IMPL_CAST(T, uint32_t)  \
  DECL_IMPL_CAST(T, uint64_t)  \
  DECL_IMPL_CAST(T, bool)      \
// DECL_IMPL_CAST(T, BFloat16)

DECL_IMPL_CAST_FROM(half)
DECL_IMPL_CAST_FROM(float)
DECL_IMPL_CAST_FROM(double)
DECL_IMPL_CAST_FROM(int8_t)
DECL_IMPL_CAST_FROM(int16_t)
DECL_IMPL_CAST_FROM(int32_t)
DECL_IMPL_CAST_FROM(int64_t)
DECL_IMPL_CAST_FROM(uint8_t)
DECL_IMPL_CAST_FROM(uint16_t)
DECL_IMPL_CAST_FROM(uint32_t)
DECL_IMPL_CAST_FROM(uint64_t)
DECL_IMPL_CAST_FROM(bool)
// DECL_IMPL_CAST_FROM(BFloat16)

template <typename InT, typename OutT>
void Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) {
  Explicit_Impl_Cast(stream, input_data, output_data, count);
}

}  // namespace cuda
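
A minimal usage sketch of the Impl_Cast wrapper declared above, casting a float buffer to half on a given stream. The header name, wrapper function, and buffer names below are assumptions for illustration, not part of this commit.

// Hypothetical caller of cuda::Impl_Cast; header name is assumed.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "unary_elementwise_ops_impl.h"

void CastFloatToHalf(cudaStream_t stream, const float* src, half* dst, size_t count) {
  // Dispatches to the Explicit_Impl_Cast(float -> half) overload generated in the .cu file.
  cuda::Impl_Cast<float, half>(stream, src, dst, count);
  // The kernel launch is asynchronous; synchronize the stream before reading dst on the host.
}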
