Commit 1fd0ca8

merge from develop: add tensorrt support for win test=develop (#19172)
* merge from develop: add tensorrt support for win test=develop
1 parent 5a86891 commit 1fd0ca8

12 files changed: 61 additions, 19 deletions

cmake/configure.cmake

Lines changed: 14 additions & 8 deletions

@@ -88,14 +88,20 @@ if(WITH_GPU)
   include_directories(${CUDA_TOOLKIT_INCLUDE})
 
   if(TENSORRT_FOUND)
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-      message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-    endif()
-    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-      message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-    endif()
-    if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-      message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+    if(WIN32)
+      if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+      endif()
+    else()
+      if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+      endif()
+      if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+        message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+      endif()
+      if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+        message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+      endif()
     endif()
     include_directories(${TENSORRT_INCLUDE_DIR})
   endif()

cmake/inference_lib.cmake

Lines changed: 1 addition & 1 deletion

@@ -218,7 +218,7 @@ endif ()
 
 if (TENSORRT_FOUND)
   copy(tensorrt_lib DEPS ${inference_deps}
-    SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+    SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
     DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
 endif ()
 

cmake/tensorrt.cmake

Lines changed: 16 additions & 2 deletions

@@ -2,14 +2,28 @@ if(NOT WITH_GPU)
   return()
 endif()
 
-set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+if(WIN32)
+  if("${TENSORRT_ROOT}" STREQUAL "")
+    message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
+  endif()
+  string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
+  set(TR_INFER_LIB nvinfer.lib)
+  set(TR_INFER_RT nvinfer.dll)
+  set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
+else()
+  set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+  set(TR_INFER_LIB libnvinfer.a)
+  set(TR_INFER_RT libnvinfer.so)
+  set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
+endif()
+
 
 find_path(TENSORRT_INCLUDE_DIR NvInfer.h
   PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
   $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
   NO_DEFAULT_PATH
 )
-find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
   PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
   $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
   NO_DEFAULT_PATH

paddle/fluid/inference/anakin/convert/op_converter.h

Lines changed: 1 addition & 1 deletion

@@ -219,7 +219,7 @@ template class AnakinOpConverter<::anakin::saber::X86,
 #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
   extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
   int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \
-      __attribute__((unused)) = \
+      UNUSED = \
       Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
 
 #if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
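The `__attribute__((unused))` → `UNUSED` change here (and in the two TensorRT headers below) replaces a GCC/Clang-specific attribute with a portable macro, since MSVC does not accept `__attribute__`. As a hedged sketch, such a macro is usually defined along these lines — Paddle keeps its own definition in a platform header, so the exact form below is illustrative, not the project's code:

```cpp
// Illustrative definition of a portable "unused" marker, assuming a
// GCC/Clang vs. MSVC split; Paddle's actual UNUSED macro may differ.
#if defined(__GNUC__) || defined(__clang__)
#define UNUSED __attribute__((unused))
#else
#define UNUSED  // MSVC has no direct equivalent; expand to nothing
#endif

// Usage mirrors the registration macros touched by this commit: the dummy
// static forces the Touch* symbol to be linked in without triggering an
// "unused variable" warning on GCC/Clang.
extern int TouchSomethingRegister();  // hypothetical registration hook
static int use_something UNUSED = TouchSomethingRegister();
```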

paddle/fluid/inference/api/demo_ci/CMakeLists.txt

Lines changed: 12 additions & 0 deletions

@@ -141,6 +141,10 @@ if(WITH_GPU)
   endif()
   set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
 else()
+  if (USE_TENSORRT)
+    set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
   set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
   set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
   set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )

@@ -150,6 +154,14 @@ endif()
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
 if(WIN32)
+  if(USE_TENSORRT)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(WITH_MKL)
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}

paddle/fluid/inference/tensorrt/convert/op_converter.h

Lines changed: 3 additions & 3 deletions

@@ -225,7 +225,7 @@ class OpConverter {
     return 0; \
   }
 
-#define USE_TRT_CONVERTER(op_type__)                                    \
-  extern int TouchConverterRegister_##op_type__();                      \
-  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+#define USE_TRT_CONVERTER(op_type__)                   \
+  extern int TouchConverterRegister_##op_type__();     \
+  static int use_op_converter_trt_##op_type__ UNUSED = \
       TouchConverterRegister_##op_type__();

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 2 additions & 2 deletions

@@ -216,8 +216,8 @@ class TensorRTEngine {
 // TensorRT has too many layers, so that is not wise to add member functions for
 // them, and an macro like this is more extensible when underlying TensorRT
 // library add new layer supports.
-#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
-  engine__->network()->add##layer__(ARGS);
+#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
+  engine__->network()->add##layer__(__VA_ARGS__);
 
 class TRTEngineManager {
  public:
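`ARGS...` (a named variadic macro parameter) is a GNU extension; MSVC rejects it, which is why the macro moves to the standard C99/C++11 `...`/`__VA_ARGS__` form. A self-contained sketch of the two spellings — `Net` and `addScale` are made-up names standing in for TensorRT's network API:

```cpp
#include <iostream>

// GNU-only spelling (rejected by MSVC):
//   #define ADD_LAYER(net__, layer__, ARGS...) (net__).add##layer__(ARGS)
// Standard spelling, accepted by GCC, Clang, and MSVC:
#define ADD_LAYER(net__, layer__, ...) (net__).add##layer__(__VA_ARGS__)

// Hypothetical stand-in for TensorRT's INetworkDefinition.
struct Net {
  void addScale(float scale, float shift) {
    std::cout << scale << " " << shift << "\n";
  }
};

int main() {
  Net net;
  ADD_LAYER(net, Scale, 2.0f, 0.5f);  // expands to net.addScale(2.0f, 0.5f)
  return 0;
}
```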

paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ int PReluPlugin::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
              cudaMemcpyHostToDevice);
+  return 0;
 }
 
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
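The added `return 0;` fixes a function that fell off the end despite being declared to return `int` — undefined behavior in C++. GCC and Clang typically emit only a `-Wreturn-type` warning for this, while MSVC promotes its equivalent diagnostic (C4716, "must return a value") to an error, so the Windows build would not compile without it. A minimal illustration (hypothetical function, not Paddle code):

```cpp
// Declared to return int, so every path must return a value.
int InitializeLike() {
  // ... allocate and copy device buffers ...
  return 0;  // without this line, GCC/Clang warn and MSVC rejects the build
}
```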

paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ class TrtPluginRegistrar {
 
 #define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
-      trt_plugin_registrar##ctr __attribute__((unused)) = \
+      trt_plugin_registrar##ctr UNUSED = \
       paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
           name, deserialize_func)
 

paddle/fluid/platform/dynload/dynamic_loader.cc

Lines changed: 2 additions & 0 deletions

@@ -247,6 +247,8 @@ void* GetNCCLDsoHandle() {
 void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
 #endif
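Two things worth noting here: Windows shared libraries drop the `lib` prefix and use `.dll`, hence the new branch's `nvinfer.dll`; and, as committed, the `_WIN32` branch searches `FLAGS_mklml_dir` rather than `FLAGS_tensorrt_dir`. Under the hood, a helper like `GetDsoHandleFromSearchPath` ultimately resolves to the platform loader; a simplified sketch of that split, with search-path handling and error reporting omitted and `OpenDso` a hypothetical name:

```cpp
// Simplified cross-platform DSO loading: LoadLibrary on Windows,
// dlopen elsewhere.
#if defined(_WIN32)
#include <windows.h>
void* OpenDso(const char* path) {
  return reinterpret_cast<void*>(LoadLibraryA(path));  // e.g. "nvinfer.dll"
}
#else
#include <dlfcn.h>
void* OpenDso(const char* path) {
  return dlopen(path, RTLD_LAZY);  // e.g. "libnvinfer.so" / "libnvinfer.dylib"
}
#endif
```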
