@@ -47,7 +47,8 @@
 #include "op_cuda.hpp"
 
 #ifdef HAVE_CUDA
-#include "cuda4dnn/primitives/eltwise.hpp"
+#include "cuda4dnn/init.hpp"
+#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
 #endif
 
 #include "halide_scheduler.hpp"
@@ -66,8 +67,6 @@
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
-#include <opencv2/core/cuda.hpp>
-
 namespace cv {
 namespace dnn {
 CV__DNN_INLINE_NS_BEGIN
@@ -159,23 +158,6 @@ class BackendRegistry
     }
 #endif
 
-#ifdef HAVE_CUDA
-    static inline bool cudaDeviceSupportsFp16() {
-        if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
-            return false;
-        const int devId = cv::cuda::getDevice();
-        if (devId < 0)
-            return false;
-        cv::cuda::DeviceInfo dev_info(devId);
-        if (!dev_info.isCompatible())
-            return false;
-        int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
-        if (version < 53)
-            return false;
-        return true;
-    }
-#endif
-
 private:
     BackendRegistry()
    {
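For reference, the helper removed above reduces to a compute-capability test: CUDA FP16 arithmetic requires compute capability 5.3 or newer. A self-contained sketch of the same check using the public cv::cuda API (assumes OpenCV built with CUDA support; the function name is illustrative):

    #include <opencv2/core/cuda.hpp>

    // Returns true if the currently selected CUDA device can run FP16 kernels.
    static bool deviceSupportsFp16()
    {
        if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
            return false;                       // no usable CUDA device at all
        const int devId = cv::cuda::getDevice();
        if (devId < 0)
            return false;                       // no device selected
        cv::cuda::DeviceInfo dev_info(devId);
        if (!dev_info.isCompatible())
            return false;                       // binaries not built for this device
        // FP16 requires compute capability 5.3 or newer
        const int version = dev_info.majorVersion() * 10 + dev_info.minorVersion();
        return version >= 53;
    }

The equivalent logic now lives behind cuda4dnn::doesDeviceSupportFP16(), so the registry no longer needs its own copy (and dnn.cpp no longer needs opencv2/core/cuda.hpp).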
@@ -247,9 +229,10 @@ class BackendRegistry
 #endif
 
 #ifdef HAVE_CUDA
-        if (haveCUDA()) {
+        if (haveCUDA() && cuda4dnn::isDeviceCompatible())
+        {
             backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
-            if (cudaDeviceSupportsFp16())
+            if (cuda4dnn::doesDeviceSupportFP16())
                 backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
         }
 #endif
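With this gate in place, the CUDA targets are only registered on devices the build actually supports; selecting them from user code is unchanged. A minimal usage sketch ("model.onnx" and the input shape are placeholders):

    #include <opencv2/dnn.hpp>

    int main()
    {
        cv::dnn::Net net = cv::dnn::readNet("model.onnx");

        // Request the CUDA backend; if the device fails the new checks,
        // initialization reports a CV_Error (see initCUDABackend below).
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA); // or DNN_TARGET_CUDA_FP16

        cv::Mat image = cv::Mat::zeros(224, 224, CV_8UC3); // dummy input image
        net.setInput(cv::dnn::blobFromImage(image, 1.0 / 255.0));
        cv::Mat out = net.forward(); // executes on the GPU if the checks pass
        return 0;
    }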
@@ -1189,19 +1172,6 @@ struct Net::Impl : public detail::NetImplBase
         preferableBackend = DNN_BACKEND_DEFAULT;
         preferableTarget = DNN_TARGET_CPU;
         skipInfEngineInit = false;
-
-#ifdef HAVE_CUDA
-        if (cv::cuda::getCudaEnabledDeviceCount() > 0)
-        {
-            cuda4dnn::csl::CSLContext context;
-            context.stream = cuda4dnn::csl::Stream(true);
-            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
-            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
-
-            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
-            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
-        }
-#endif
     }
 
     Ptr<DataLayer> netInputLayer;
@@ -1300,13 +1270,6 @@ struct Net::Impl : public detail::NetImplBase
         }
 
         Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
-#ifdef HAVE_CUDA
-        if (preferableBackend == DNN_BACKEND_CUDA)
-        {
-            auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
-            cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
-        }
-#endif
         backendWrappers[data] = wrapper;
         return wrapper;
     }
@@ -2374,10 +2337,57 @@ struct Net::Impl : public detail::NetImplBase
 #endif
     }
 
-    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
+    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
+    {
         CV_Assert(haveCUDA());
+        CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
 
 #ifdef HAVE_CUDA
+        if (cuda4dnn::getDeviceCount() <= 0)
+            CV_Error(Error::StsError, "No CUDA capable device found.");
+
+        if (cuda4dnn::getDevice() < 0)
+            CV_Error(Error::StsError, "No CUDA capable device selected.");
+
+        if (!cuda4dnn::isDeviceCompatible())
+            CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
+
+        if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
+            CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
+
+        if (!cudaInfo)
+        {
+            cuda4dnn::csl::CSLContext context;
+            context.stream = cuda4dnn::csl::Stream(true);
+            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
+            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
+
+            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
+            cuda4dnn::checkVersions();
+        }
+
+        cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
+
+        for (auto& layer : layers)
+        {
+            auto& ld = layer.second;
+            if (ld.id == 0)
+            {
+                for (auto& wrapper : ld.inputBlobsWrappers)
+                {
+                    auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                    cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+                }
+            }
+
+            for (auto& wrapper : ld.outputBlobsWrappers)
+            {
+                auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+            }
+        }
+
         for (auto& layer : layers)
         {
             auto& ld = layer.second;
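The loop above attaches every blob wrapper to one shared compute stream plus a dedicated stream for device-to-host transfers, so output downloads can overlap subsequent kernel work. Underneath the CSL wrappers this is the standard CUDA two-stream pattern; a minimal plain-CUDA sketch of that pattern (not OpenCV code, pinned host memory assumed):

    #include <cuda_runtime.h>

    int main()
    {
        cudaStream_t compute, d2h;
        cudaStreamCreate(&compute);
        cudaStreamCreate(&d2h);

        float *dev = nullptr, *host = nullptr;
        cudaMalloc(&dev, 1024 * sizeof(float));
        cudaMallocHost(&host, 1024 * sizeof(float)); // pinned memory enables async copies

        // ... enqueue kernels on `compute` ...
        cudaStreamSynchronize(compute);              // results ready before download

        cudaMemcpyAsync(host, dev, 1024 * sizeof(float), cudaMemcpyDeviceToHost, d2h);
        cudaStreamSynchronize(d2h);                  // host buffer now valid

        cudaFreeHost(host);
        cudaFree(dev);
        cudaStreamDestroy(d2h);
        cudaStreamDestroy(compute);
        return 0;
    }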
@@ -2653,11 +2663,11 @@ struct Net::Impl : public detail::NetImplBase
             if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
             {
                 // we create a temporary backend node for the eltwise layer to obtain its configuration
-                auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */
+                cuda4dnn::csl::CSLContext context; // assumes that initCUDA and EltwiseOp do not use the context during init
                 const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                 const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
                 if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
-                   nextEltwiseLayer = Ptr<EltwiseLayer>();
+                    nextEltwiseLayer = Ptr<EltwiseLayer>();
 
                 // check for variable channels
                 auto& inputs = nextData->inputBlobs;