Skip to content

Commit 62a6302

Browse files
committed
add cuDNN 8 support
1 parent 206c843 commit 62a6302

File tree

3 files changed

+91
-16
lines changed

3 files changed

+91
-16
lines changed

modules/dnn/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,10 @@ if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
2121
add_definitions(-DCV_OCL4DNN=1)
2222
endif()
2323

24-
if(NOT DEFINED OPENCV_DNN_CUDA AND HAVE_CUDNN AND CUDNN_VERSION VERSION_LESS 8.0)
25-
message(STATUS "DNN: CUDNN 8.0 is not supported yet. Details: https://github.com/opencv/opencv/issues/17496")
26-
endif()
2724
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
2825
HAVE_CUDA
2926
AND HAVE_CUBLAS
3027
AND HAVE_CUDNN
31-
AND CUDNN_VERSION VERSION_LESS 8.0
3228
)
3329

3430
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)

modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
225225
);
226226
}
227227
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
228+
229+
#if CUDNN_MAJOR >= 8
230+
/* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
231+
* in the default setting. TF32 convolutions have lower precision than FP32.
232+
* Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
233+
*/
234+
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
235+
#endif
236+
228237
if (std::is_same<T, half>::value)
229238
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
230239
} catch (...) {
@@ -254,15 +263,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
254263
*/
255264
ConvolutionAlgorithm(
256265
const Handle& handle,
257-
const ConvolutionDescriptor<T>& conv,
258-
const FilterDescriptor<T>& filter,
259-
const TensorDescriptor<T>& input,
260-
const TensorDescriptor<T>& output)
266+
const ConvolutionDescriptor<T>& convDesc,
267+
const FilterDescriptor<T>& filterDesc,
268+
const TensorDescriptor<T>& inputDesc,
269+
const TensorDescriptor<T>& outputDesc)
261270
{
271+
#if CUDNN_MAJOR >= 8
272+
int requestedAlgoCount = 0, returnedAlgoCount = 0;
273+
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
274+
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
275+
CUDA4DNN_CHECK_CUDNN(
276+
cudnnGetConvolutionForwardAlgorithm_v7(
277+
handle.get(),
278+
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
279+
requestedAlgoCount,
280+
&returnedAlgoCount,
281+
&results[0]
282+
)
283+
);
284+
285+
size_t free_memory, total_memory;
286+
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
287+
288+
bool found_conv_algorithm = false;
289+
for (int i = 0; i < returnedAlgoCount; i++)
290+
{
291+
if (results[i].status == CUDNN_STATUS_SUCCESS &&
292+
results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
293+
results[i].memory < free_memory)
294+
{
295+
found_conv_algorithm = true;
296+
algo = results[i].algo;
297+
workspace_size = results[i].memory;
298+
break;
299+
}
300+
}
301+
302+
if (!found_conv_algorithm)
303+
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
304+
#else
262305
CUDA4DNN_CHECK_CUDNN(
263306
cudnnGetConvolutionForwardAlgorithm(
264307
handle.get(),
265-
input.get(), filter.get(), conv.get(), output.get(),
308+
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
266309
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
267310
0, /* no memory limit */
268311
&algo
@@ -272,10 +315,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
272315
CUDA4DNN_CHECK_CUDNN(
273316
cudnnGetConvolutionForwardWorkspaceSize(
274317
handle.get(),
275-
input.get(), filter.get(), conv.get(), output.get(),
318+
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
276319
algo, &workspace_size
277320
)
278321
);
322+
#endif
279323
}
280324

281325
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;

modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
3030

3131
TransposeConvolutionAlgorithm(
3232
const Handle& handle,
33-
const ConvolutionDescriptor<T>& conv,
34-
const FilterDescriptor<T>& filter,
35-
const TensorDescriptor<T>& input,
36-
const TensorDescriptor<T>& output)
33+
const ConvolutionDescriptor<T>& convDesc,
34+
const FilterDescriptor<T>& filterDesc,
35+
const TensorDescriptor<T>& inputDesc,
36+
const TensorDescriptor<T>& outputDesc)
3737
{
38+
#if CUDNN_MAJOR >= 8
39+
int requestedAlgoCount = 0, returnedAlgoCount = 0;
40+
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
41+
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
42+
CUDA4DNN_CHECK_CUDNN(
43+
cudnnGetConvolutionBackwardDataAlgorithm_v7(
44+
handle.get(),
45+
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
46+
requestedAlgoCount,
47+
&returnedAlgoCount,
48+
&results[0]
49+
)
50+
);
51+
52+
size_t free_memory, total_memory;
53+
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
54+
55+
bool found_conv_algorithm = false;
56+
for (int i = 0; i < returnedAlgoCount; i++)
57+
{
58+
if (results[i].status == CUDNN_STATUS_SUCCESS &&
59+
results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
60+
results[i].memory < free_memory)
61+
{
62+
found_conv_algorithm = true;
63+
dalgo = results[i].algo;
64+
workspace_size = results[i].memory;
65+
break;
66+
}
67+
}
68+
69+
if (!found_conv_algorithm)
70+
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
71+
#else
3872
CUDA4DNN_CHECK_CUDNN(
3973
cudnnGetConvolutionBackwardDataAlgorithm(
4074
handle.get(),
41-
filter.get(), input.get(), conv.get(), output.get(),
75+
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
4276
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
4377
0, /* no memory limit */
4478
&dalgo
@@ -48,10 +82,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
4882
CUDA4DNN_CHECK_CUDNN(
4983
cudnnGetConvolutionBackwardDataWorkspaceSize(
5084
handle.get(),
51-
filter.get(), input.get(), conv.get(), output.get(),
85+
filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
5286
dalgo, &workspace_size
5387
)
5488
);
89+
#endif
5590
}
5691

5792
TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;

0 commit comments

Comments (0)