microsoft
diff --git a/‎plugin_execution_providers/tensorrt/CMakeLists.txt‎
Lines changed: 119 additions & 0 deletions b/‎plugin_execution_providers/tensorrt/CMakeLists.txt‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎plugin_execution_providers/tensorrt/cuda/cu_inc/unary_elementwise_impl.cuh‎
Lines changed: 78 additions & 0 deletions b/‎plugin_execution_providers/tensorrt/cuda/cu_inc/unary_elementwise_impl.cuh‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.cu‎
Lines changed: 93 additions & 0 deletions b/‎plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.cu‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.h‎
Lines changed: 54 additions & 0 deletions b/‎plugin_execution_providers/tensorrt/cuda/unary_elementwise_ops_impl.h‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎plugin_execution_providers/tensorrt/ep_abi_utils.cc‎
Lines changed: 12 additions & 0 deletions b/‎plugin_execution_providers/tensorrt/ep_abi_utils.cc‎
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,119 @@
+# usage:
+# cd build/
+# cmake -S ../ -B ./ -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DORT_HOME=/home/lochi/repos/ort -DTENSORRT_HOME=/home/lochi/tensorrt/TensorRT-10.3.0.26
+#   (for CMAKE_CUDA_ARCHITECTURES, take the output of "nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits" without the decimal point, e.g. 8.0 -> 80)
+# cmake --build ./ --config Debug
+cmake_minimum_required(VERSION 3.26)
+project(TensorRTEp VERSION 1.0)
+
+# Build the C++ sources as C++17 and fail at configure time if the compiler cannot do so.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+enable_language(CUDA)
+
+# Hint FindCUDAToolkit at the conventional Linux install location, but only when the
+# caller has not chosen a toolkit and that directory actually exists.
+# (The previous file(TO_CMAKE_PATH CUDAToolkit_ROOT "/usr/local/cuda") call had its
+# arguments swapped - the signature is file(TO_CMAKE_PATH "<path>" <out-var>) - so
+# CUDAToolkit_ROOT was never set.)
+if (NOT DEFINED CUDAToolkit_ROOT AND IS_DIRECTORY "/usr/local/cuda")
+  set(CUDAToolkit_ROOT "/usr/local/cuda")
+endif()
+find_package(CUDAToolkit REQUIRED)
+
+# Collect the execution provider sources (C++ plus the one CUDA translation unit).
+file(GLOB tensorrt_src "./*.cc" "./utils/*.cc" "./cuda/unary_elementwise_ops_impl.cu")
+add_library(TensorRTEp SHARED ${tensorrt_src})
+
+# Preprocessor definitions needed by the TensorRTEp sources. They are attached to the
+# target instead of the directory so they do not leak into the fetched dependencies.
+target_compile_definitions(TensorRTEp PRIVATE
+  ONNX_NAMESPACE=onnx
+  ONNX_ML
+  NV_TENSORRT_MAJOR=10
+  NOMINMAX
+)
+
+# Fall back to an optimized build when no configuration was requested.
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+# The ONNX Runtime tree must be supplied by the caller; stop configuring without it.
+if (NOT ORT_HOME)
+  message(FATAL_ERROR "Please specify ORT_HOME, e.g. -DORT_HOME=/path/to/ort/")
+endif()
+
+# Same for the TensorRT SDK location.
+if (NOT TENSORRT_HOME)
+  message(FATAL_ERROR "Please specify TENSORRT_HOME, e.g. -DTENSORRT_HOME=/path/to/trt/")
+endif()
+
+# Third-party sources that are downloaded at configure time.
+include(FetchContent)
+
+# Microsoft GSL, pinned to a release tag.
+FetchContent_Declare(gsl
+  GIT_REPOSITORY https://github.com/microsoft/GSL.git
+  GIT_TAG        v4.0.0
+)
+
+# FlatBuffers, pinned to a release tag.
+FetchContent_Declare(flatbuffers
+  GIT_REPOSITORY https://github.com/google/flatbuffers.git
+  GIT_TAG        v23.5.26
+)
+
+# Populate and add both projects, in the order listed.
+FetchContent_MakeAvailable(gsl flatbuffers)
+
+# PLATFORM is used as a path component below ${ORT_HOME}/build.
+if (WIN32)
+  set(PLATFORM "Windows")
+else()
+  set(PLATFORM "Linux")
+endif()
+
+# Locations shared by both platforms.
+set(ort_build_dir "${ORT_HOME}/build/${PLATFORM}/${CMAKE_BUILD_TYPE}")
+set(DEPS_PATH "${ort_build_dir}/_deps")
+
+# A Debug configuration links the "d"-suffixed protobuf libraries; every other
+# configuration links the unsuffixed ones.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  set(protobuf_suffix "d")
+else()
+  set(protobuf_suffix "")
+endif()
+
+if (WIN32)
+  # Import libraries; the dependency builds place them in a per-configuration sub-directory.
+  set(ORT_LIB "${ort_build_dir}/${CMAKE_BUILD_TYPE}/onnxruntime.lib")
+  set(TRT_LIBS "${TENSORRT_HOME}/lib/nvinfer_10.lib"
+               "${TENSORRT_HOME}/lib/nvinfer_plugin_10.lib"
+               "${TENSORRT_HOME}/lib/nvonnxparser_10.lib")
+  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/${CMAKE_BUILD_TYPE}/flatbuffers.lib"
+                "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx.lib"
+                "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx_proto.lib")
+  list(APPEND DEPS_LIBS
+       "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotobuf${protobuf_suffix}.lib"
+       "${DEPS_PATH}/protobuf-build/${CMAKE_BUILD_TYPE}/libprotoc${protobuf_suffix}.lib")
+else()
+  # Shared objects for ORT/TensorRT, static archives for the remaining dependencies.
+  set(ORT_LIB "${ort_build_dir}/libonnxruntime.so")
+  set(TRT_LIBS "${TENSORRT_HOME}/lib/libnvinfer.so"
+               "${TENSORRT_HOME}/lib/libnvinfer_plugin.so"
+               "${TENSORRT_HOME}/lib/libnvonnxparser.so")
+  set(DEPS_LIBS "${DEPS_PATH}/flatbuffers-build/libflatbuffers.a"
+                "${DEPS_PATH}/onnx-build/libonnx.a"
+                "${DEPS_PATH}/onnx-build/libonnx_proto.a")
+  list(APPEND DEPS_LIBS
+       "${DEPS_PATH}/protobuf-build/libprotobuf${protobuf_suffix}.a"
+       "${DEPS_PATH}/protobuf-build/libprotoc${protobuf_suffix}.a")
+endif()
+
+# Echo the resolved locations so a wrong path is easy to spot in the configure log.
+message(STATUS "Looking for following dependencies ...")
+message(STATUS "Platform : ${PLATFORM}")
+message(STATUS "ORT home : ${ORT_HOME}")
+message(STATUS "ORT lib  : ${ORT_LIB}")
+message(STATUS "Deps path: ${DEPS_PATH}")
+message(STATUS "Deps libs: ${DEPS_LIBS}")
+message(STATUS "TRT libs : ${TRT_LIBS}")
+
+# Header search paths for the execution provider.
+# The CUDA headers come from the toolkit located by find_package(CUDAToolkit) rather
+# than a hard-coded /usr/local/cuda/include, so they always match the toolkit that is
+# actually linked (and the path is valid on Windows as well).
+target_include_directories(TensorRTEp PUBLIC "${ORT_HOME}/include/onnxruntime/core/session/"
+                                             "${CMAKE_CURRENT_SOURCE_DIR}/utils"
+                                             ${CUDAToolkit_INCLUDE_DIRS}
+                                             "${TENSORRT_HOME}/include"
+                                             "${DEPS_PATH}/flatbuffers-src/include"
+                                             "${DEPS_PATH}/gsl-src/include"
+                                             "${DEPS_PATH}/onnx-src"
+                                             "${DEPS_PATH}/onnx-build"
+                                             "${DEPS_PATH}/protobuf-src/src"
+)
+
+# Link against ONNX Runtime, TensorRT, the CUDA runtime, the prebuilt ORT dependencies
+# and the targets provided by the fetched GSL / FlatBuffers projects.
+target_link_libraries(TensorRTEp PUBLIC ${ORT_LIB}
+                                        ${TRT_LIBS}
+                                        CUDA::cudart
+                                        ${DEPS_LIBS}
+                                        GSL
+                                        flatbuffers)
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <stdint.h>
+
+namespace onnxruntime {
+namespace cuda {
+
+// Integer type used for element indices and counts in the CUDA code below.
+// A 64-bit integer would be preferable to support large matrices; however, CUDA seems to support only 32-bit integers.
+// For now, use int32_t so that both Linux and Windows see this as a 32-bit integer type.
+// The #ifndef guard leaves any earlier definition of CUDA_LONG in place.
+#ifndef CUDA_LONG
+#define CUDA_LONG int32_t
+#endif
+
+// Integer division rounding up: returns ceil(a / b), converted back to INT.
+template <class INT, class INT2>
+inline __host__ __device__ INT CeilDiv(INT a, INT2 b) {
+  // The arithmetic is done in size_t on purpose: b may be INT_MAX (for maxGridSize[]),
+  // and a + b - 1 has to be formed in a wider type.
+  const size_t divisor = static_cast<size_t>(b);
+  const size_t rounded_up = static_cast<size_t>(a) + divisor - 1;
+  return static_cast<INT>(rounded_up / divisor);
+}
+
+// Launch-configuration constants for the element-wise kernel in this header.
+// The values are enumerators with CUDA_LONG as the underlying type.
+struct GridDim {
+  enum : CUDA_LONG {
+    maxThreadsPerBlock = 256,  // number of threads per block used for the launch
+    maxElementsPerThread = 4,  // number of elements each thread processes
+  };
+};
+
+// Element-wise kernel: output_data[i] = functor(input_data[i]) for every i in [0, N).
+//
+// A block covers NumElementsPerThread * NumThreadsPerBlock consecutive elements. Each
+// thread starts at the block's base offset plus threadIdx.x and then advances in steps
+// of NumThreadsPerBlock, handling at most NumElementsPerThread elements.
+template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void _UnaryElementWise(
+    const InT* input_data,
+    OutT* output_data,
+    const FuncT functor,
+    CUDA_LONG N) {
+  const CUDA_LONG first = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
+  InT staged[NumElementsPerThread];
+
+  // Pass 1: read this thread's inputs into the local array.
+  CUDA_LONG cursor = first;
+  #pragma unroll
+  for (int k = 0; k < NumElementsPerThread; ++k) {
+    if (cursor < N) {
+      staged[k] = input_data[cursor];
+      cursor += NumThreadsPerBlock;
+    }
+  }
+
+  // Pass 2: walk the same indices again, apply the functor and write the results.
+  cursor = first;
+  #pragma unroll
+  for (int k = 0; k < NumElementsPerThread; ++k) {
+    if (cursor < N) {
+      output_data[cursor] = functor(staged[k]);
+      cursor += NumThreadsPerBlock;
+    }
+  }
+}
+
+// Applies `func` to `count` elements of `input_data` and writes the results to
+// `output_data`, by launching _UnaryElementWise on `stream`.
+//
+//   stream      - CUDA stream the kernel launches are enqueued on
+//   input_data  - pointer to `count` input elements (passed to the kernel as-is)
+//   output_data - pointer to `count` output elements (passed to the kernel as-is)
+//   func        - functor invoked as func(InT) inside the kernel; copied by value per launch
+//   count       - number of elements; 0 is a no-op
+//
+// The kernel indexes with the 32-bit CUDA_LONG type. Instead of silently truncating
+// `count` (and the block count) to 32 bits, inputs larger than one launch can address
+// are processed in several launches over consecutive sub-ranges. Inputs that fit in a
+// single chunk are launched exactly as before.
+template <typename InT, typename OutT, typename FuncT>
+void UnaryElementWiseImpl(
+    cudaStream_t stream,
+    const InT* input_data,
+    OutT* output_data,
+    const FuncT& func,
+    size_t count) {
+  if (count == 0)  // special case where there's a dim value of 0 in the shape
+    return;
+
+  // Number of elements covered by one thread block.
+  constexpr size_t kElementsPerBlock =
+      static_cast<size_t>(GridDim::maxThreadsPerBlock) * static_cast<size_t>(GridDim::maxElementsPerThread);
+  // Upper bound on elements per launch: a multiple of kElementsPerBlock that leaves
+  // ample headroom so the kernel's CUDA_LONG index arithmetic cannot overflow.
+  constexpr size_t kMaxElementsPerLaunch = static_cast<size_t>(1) << 30;
+
+  size_t offset = 0;
+  size_t remaining = count;
+  while (remaining > 0) {
+    const size_t chunk = remaining < kMaxElementsPerLaunch ? remaining : kMaxElementsPerLaunch;
+    const int blocksPerGrid = static_cast<int>(CeilDiv(chunk, kElementsPerBlock));
+    const CUDA_LONG N = static_cast<CUDA_LONG>(chunk);
+    _UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+        <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+            input_data + offset,
+            output_data + offset,
+            func,
+            N);
+    offset += chunk;
+    remaining -= chunk;
+  }
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
@@ -0,0 +1,93 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <cuda_runtime.h>
+#include "cu_inc/unary_elementwise_impl.cuh"
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
+#include "cuda_fp8.h"
+#endif
+#include <cuda_fp16.h>
+
+namespace onnxruntime {
+
+namespace cuda {
+
+// The postfix letters denote the types supported by the op:
+// B: uint8_t
+// W: uint16_t
+// U: uint32_t
+// Z: uint64_t
+// C: int8_t
+// S: int16_t
+// I: int32_t
+// L: int64_t
+// H: float16
+// F: float
+// D: double
+// O: bool
+// X: BFloat16
+
+// Maps a type to the intermediate type a value is converted through during a cast.
+// In the general case the intermediate type is the type itself.
+template <typename T>
+struct ViaTypeMap {
+  using ViaT = T;
+};
+
+// half needs to be converted via float from most other types.
+template <>
+struct ViaTypeMap<half> {
+  using ViaT = float;
+};
+
+// Device-side functor converting one value from InT to OutT.
+// When either InT or OutT is half, the value is first converted to the intermediate
+// type ViaTypeMap<half>::ViaT; otherwise the intermediate type is ViaTypeMap<OutT>::ViaT.
+template <typename InT, typename OutT>
+struct OP_Cast {
+  __device__ __inline__ OutT operator()(const InT& a) const {
+    constexpr bool kInvolvesHalf = std::is_same<half, InT>::value || std::is_same<half, OutT>::value;
+    using KeyT = typename std::conditional<kInvolvesHalf, half, OutT>::type;
+    using IntermediateT = typename ViaTypeMap<KeyT>::ViaT;
+    return static_cast<OutT>(static_cast<IntermediateT>(a));
+  }
+};
+
+// Defines the Explicit_Impl_Cast overload for one (InT, OutT) pair. The overload runs
+// UnaryElementWiseImpl with an OP_Cast<InT, OutT> functor over `count` elements on `stream`.
+#define IMPL_CAST_IMPL(InT, OutT)                                                                        \
+  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { \
+    UnaryElementWiseImpl(stream, input_data, output_data, OP_Cast<InT, OutT>(), count);                  \
+  }
+
+// Defines an Explicit_Impl_Cast overload that ignores its arguments and raises an
+// error via ORT_THROW stating that the cast must define saturate.
+// NOTE(review): this macro is not expanded anywhere in this file, and ORT_THROW is not
+// declared by any header included here - confirm it is available before using the macro.
+#define IMPL_CAST_IMPL_THROW(InT, OutT)                                                              \
+  void Explicit_Impl_Cast(cudaStream_t /*stream*/, const InT* /*input_data*/, OutT* /*output_data*/, \
+                          size_t /*count*/) {                                                        \
+    ORT_THROW("Cast from " #InT " to " #OutT " must define saturate.");                              \
+  }
+
+// Expands IMPL_CAST_IMPL once per destination type for the given source type.
+// BFloat16 as a destination type is currently disabled.
+#define IMPL_CAST_IMPL_FROM(SrcT) \
+  IMPL_CAST_IMPL(SrcT, half)      \
+  IMPL_CAST_IMPL(SrcT, float)     \
+  IMPL_CAST_IMPL(SrcT, double)    \
+  IMPL_CAST_IMPL(SrcT, int8_t)    \
+  IMPL_CAST_IMPL(SrcT, int16_t)   \
+  IMPL_CAST_IMPL(SrcT, int32_t)   \
+  IMPL_CAST_IMPL(SrcT, int64_t)   \
+  IMPL_CAST_IMPL(SrcT, uint8_t)   \
+  IMPL_CAST_IMPL(SrcT, uint16_t)  \
+  IMPL_CAST_IMPL(SrcT, uint32_t)  \
+  IMPL_CAST_IMPL(SrcT, uint64_t)  \
+  IMPL_CAST_IMPL(SrcT, bool)
+
+// Overload definitions for every source type.
+// BFloat16 as a source type is currently disabled.
+IMPL_CAST_IMPL_FROM(half)
+IMPL_CAST_IMPL_FROM(float)
+IMPL_CAST_IMPL_FROM(double)
+IMPL_CAST_IMPL_FROM(int8_t)
+IMPL_CAST_IMPL_FROM(int16_t)
+IMPL_CAST_IMPL_FROM(int32_t)
+IMPL_CAST_IMPL_FROM(int64_t)
+IMPL_CAST_IMPL_FROM(uint8_t)
+IMPL_CAST_IMPL_FROM(uint16_t)
+IMPL_CAST_IMPL_FROM(uint32_t)
+IMPL_CAST_IMPL_FROM(uint64_t)
+IMPL_CAST_IMPL_FROM(bool)
+
+}  // namespace cuda
+}  // namespace onnxruntime
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <stdint.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+namespace onnxruntime {
+namespace cuda {
+
+// Cast
+
+// Declares the Explicit_Impl_Cast overload for one (InT, OutT) pair.
+#define DECL_IMPL_CAST(InT, OutT) \
+  void Explicit_Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count);
+
+// Expands DECL_IMPL_CAST once per destination type for the given source type.
+// BFloat16 as a destination type is currently disabled.
+#define DECL_IMPL_CAST_FROM(SrcT) \
+  DECL_IMPL_CAST(SrcT, half)      \
+  DECL_IMPL_CAST(SrcT, float)     \
+  DECL_IMPL_CAST(SrcT, double)    \
+  DECL_IMPL_CAST(SrcT, int8_t)    \
+  DECL_IMPL_CAST(SrcT, int16_t)   \
+  DECL_IMPL_CAST(SrcT, int32_t)   \
+  DECL_IMPL_CAST(SrcT, int64_t)   \
+  DECL_IMPL_CAST(SrcT, uint8_t)   \
+  DECL_IMPL_CAST(SrcT, uint16_t)  \
+  DECL_IMPL_CAST(SrcT, uint32_t)  \
+  DECL_IMPL_CAST(SrcT, uint64_t)  \
+  DECL_IMPL_CAST(SrcT, bool)
+
+// Overload declarations for every source type.
+// BFloat16 as a source type is currently disabled.
+DECL_IMPL_CAST_FROM(half)
+DECL_IMPL_CAST_FROM(float)
+DECL_IMPL_CAST_FROM(double)
+DECL_IMPL_CAST_FROM(int8_t)
+DECL_IMPL_CAST_FROM(int16_t)
+DECL_IMPL_CAST_FROM(int32_t)
+DECL_IMPL_CAST_FROM(int64_t)
+DECL_IMPL_CAST_FROM(uint8_t)
+DECL_IMPL_CAST_FROM(uint16_t)
+DECL_IMPL_CAST_FROM(uint32_t)
+DECL_IMPL_CAST_FROM(uint64_t)
+DECL_IMPL_CAST_FROM(bool)
+
+// Generic cast entry point: forwards all arguments to the Explicit_Impl_Cast overload
+// that matches the (InT, OutT) pair.
+template <typename InT, typename OutT>
+void Impl_Cast(cudaStream_t stream,
+               const InT* input_data,
+               OutT* output_data,
+               size_t count) {
+  Explicit_Impl_Cast(stream, input_data, output_data, count);
+}
+
+}  // namespace cuda
+
+}  // namespace onnxruntime
@@ -0,0 +1,12 @@
+#define ORT_API_MANUAL_INIT
+#include "onnxruntime_cxx_api.h"
+#undef ORT_API_MANUAL_INIT
+
+#include <gsl/gsl>
+#include <cassert>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+