@@ -225,6 +225,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
225
225
);
226
226
}
227
227
CUDA4DNN_CHECK_CUDNN (cudnnSetConvolutionGroupCount (descriptor, group_count));
228
+
229
+ #if CUDNN_MAJOR >= 8
230
+ /* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
231
+ * in the default setting. TF32 convolutions have lower precision than FP32.
232
+ * Hence, we set the math type to CUDNN_FMA_MATH to reproduce the old behavior.
233
+ * For T = half, this is overridden with CUDNN_TENSOR_OP_MATH right below. */
234
+ CUDA4DNN_CHECK_CUDNN (cudnnSetConvolutionMathType (descriptor, CUDNN_FMA_MATH));
235
+ #endif
236
+
228
237
if (std::is_same<T, half>::value)
229
238
CUDA4DNN_CHECK_CUDNN (cudnnSetConvolutionMathType (descriptor, CUDNN_TENSOR_OP_MATH));
230
239
} catch (...) {
@@ -254,15 +263,49 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
254
263
*/
255
264
ConvolutionAlgorithm (
256
265
const Handle& handle,
257
- const ConvolutionDescriptor<T>& conv ,
258
- const FilterDescriptor<T>& filter ,
259
- const TensorDescriptor<T>& input ,
260
- const TensorDescriptor<T>& output )
266
+ const ConvolutionDescriptor<T>& convDesc ,
267
+ const FilterDescriptor<T>& filterDesc ,
268
+ const TensorDescriptor<T>& inputDesc ,
269
+ const TensorDescriptor<T>& outputDesc )
261
270
{
271
+ #if CUDNN_MAJOR >= 8
272
+ int requestedAlgoCount = 0 , returnedAlgoCount = 0 ;
273
+ CUDA4DNN_CHECK_CUDNN (cudnnGetConvolutionForwardAlgorithmMaxCount (handle.get (), &requestedAlgoCount));
274
+ std::vector<cudnnConvolutionFwdAlgoPerf_t> results (requestedAlgoCount);
275
+ CUDA4DNN_CHECK_CUDNN (
276
+ cudnnGetConvolutionForwardAlgorithm_v7 (
277
+ handle.get (),
278
+ inputDesc.get (), filterDesc.get (), convDesc.get (), outputDesc.get (),
279
+ requestedAlgoCount,
280
+ &returnedAlgoCount,
281
+ &results[0 ]
282
+ )
283
+ );
284
+
285
+ size_t free_memory, total_memory;
286
+ CUDA4DNN_CHECK_CUDA (cudaMemGetInfo (&free_memory, &total_memory));
287
+
288
+ bool found_conv_algorithm = false ;
289
+ for (int i = 0 ; i < returnedAlgoCount; i++)
290
+ {
291
+ if (results[i].status == CUDNN_STATUS_SUCCESS &&
292
+ results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
293
+ results[i].memory < free_memory)
294
+ {
295
+ found_conv_algorithm = true ;
296
+ algo = results[i].algo ;
297
+ workspace_size = results[i].memory ;
298
+ break ;
299
+ }
300
+ }
301
+
302
+ if (!found_conv_algorithm)
303
+ CV_Error (cv::Error::GpuApiCallError, " cuDNN did not return a suitable algorithm for convolution." );
304
+ #else
262
305
CUDA4DNN_CHECK_CUDNN (
263
306
cudnnGetConvolutionForwardAlgorithm (
264
307
handle.get (),
265
- input .get (), filter .get (), conv .get (), output .get (),
308
+ inputDesc .get (), filterDesc .get (), convDesc .get (), outputDesc .get (),
266
309
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
267
310
0 , /* no memory limit */
268
311
&algo
@@ -272,10 +315,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
272
315
CUDA4DNN_CHECK_CUDNN (
273
316
cudnnGetConvolutionForwardWorkspaceSize (
274
317
handle.get (),
275
- input .get (), filter .get (), conv .get (), output .get (),
318
+ inputDesc .get (), filterDesc .get (), convDesc .get (), outputDesc .get (),
276
319
algo, &workspace_size
277
320
)
278
321
);
322
+ #endif
279
323
}
280
324
281
325
ConvolutionAlgorithm& operator =(const ConvolutionAlgorithm&) = default ;
0 commit comments