Skip to content

Commit 187ba08

Browse files
committed
enable tensor core for conv cudnn
1 parent 66e0aed commit 187ba08

File tree

3 files changed

+27
-3
lines changed

3 files changed

+27
-3
lines changed

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
128128
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
129129
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
130130
workspace_size_limit, &algo));
131+
132+
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
133+
// Tensor core is supported since the Volta GPU and
134+
// is only enabled when input and filter data are float16
135+
if (dev_ctx.GetComputeCapability() >= 70 &&
136+
std::type_index(typeid(T)) ==
137+
std::type_index(typeid(platform::float16))) {
138+
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
139+
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
140+
// Currently tensor core is only enabled using this algo
141+
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
142+
} else {
143+
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
144+
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
145+
}
146+
#endif
147+
131148
// get workspace size able to allocate
132149
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
133150
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
134151
cudnn_output_desc, algo, &workspace_size_in_bytes));
152+
// It is possible for float16 on Volta GPU to allocate more memory than
153+
// the limit because the algo is overridden to use tensor core.
154+
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
155+
"workspace_size to be allocated exceeds the limit");
156+
135157
// Allocate on GPU memory
136158
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
137159
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);

paddle/fluid/platform/cudnn_helper.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
257257
}
258258
#endif
259259

260+
cudnnDataType_t compute_type =
261+
(type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
260262
PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
261263
desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
262-
CUDNN_CROSS_CORRELATION, type));
264+
CUDNN_CROSS_CORRELATION, compute_type));
263265
return desc_;
264266
}
265267

paddle/fluid/platform/dynload/cudnn.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ limitations under the License. */
1616

1717
#include <cudnn.h>
1818
#include <dlfcn.h>
19-
#include <mutex>
2019
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
2120

2221
namespace paddle {
@@ -140,7 +139,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
140139

141140
#if CUDNN_VERSION >= 7001
142141
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
143-
__macro(cudnnSetConvolutionGroupCount);
142+
__macro(cudnnSetConvolutionGroupCount); \
143+
__macro(cudnnSetConvolutionMathType);
144144
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
145145
#endif
146146

0 commit comments

Comments
 (0)