
Commit 988bc80

Merge pull request opencv#17748 from YashasSamaga:cuda4dnn-data-parallel
2 parents 52ac366 + 4988e13

File tree: 3 files changed (+95, -10 lines)


modules/dnn/src/cuda4dnn/csl/event.hpp (5 additions, 3 deletions)

@@ -33,7 +33,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
     /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
     Event(bool create, bool timing_event = false) : event{nullptr} {
         if (create) {
-            unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming);
+            unsigned int flags = (timing_event ? 0 : cudaEventDisableTiming);
             CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
         }
     }

@@ -60,6 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

     /** mark a point in \p stream */
     void record(const Stream& stream) {
+        CV_Assert(stream);
         CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
     }

@@ -85,12 +86,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
     };

     /** makes a stream wait on an event */
-    void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+        CV_Assert(stream);
         CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
     }

     /** returns the time elapsed between two events in milliseconds */
-    float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
+    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
         float temp;
         CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
         return temp;

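Two things are going on in event.hpp. Dropping cudaEventBlockingSync leaves a plain spin-waiting event, which trades CPU cycles for lower synchronization latency; and the free functions defined in this header need `inline` because op_cuda.hpp (third file below) now includes event.hpp, so non-inline definitions would violate the one-definition rule and fail to link. A minimal, self-contained sketch of the raw CUDA pattern these helpers wrap (stream/event names are illustrative, not OpenCV code):

    #include <cuda_runtime.h>

    int main() {
        cudaStream_t producer, consumer;
        cudaStreamCreate(&producer);
        cudaStreamCreate(&consumer);

        // cudaEventDisableTiming makes record/wait cheaper when no timestamps are
        // needed; without cudaEventBlockingSync (dropped by this commit), a host-side
        // cudaEventSynchronize spin-waits instead of yielding the calling thread.
        cudaEvent_t ready;
        cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);

        // ... enqueue work on `producer` here ...
        cudaEventRecord(ready, producer);        // mark a point in the producer stream
        cudaStreamWaitEvent(consumer, ready, 0); // consumer waits at the mark; the host does not block
        // ... enqueue dependent work on `consumer` here ...

        cudaStreamSynchronize(consumer);
        cudaEventDestroy(ready);
        cudaStreamDestroy(producer);
        cudaStreamDestroy(consumer);
        return 0;
    }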
modules/dnn/src/dnn.cpp (30 additions, 5 deletions)

@@ -585,6 +585,13 @@ struct LayerData
     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;

+#ifdef HAVE_CUDA
+    /* output ids which must be transferred to the host in the background
+     * after the completion of the forward pass of the layer
+     */
+    std::vector<int> cudaD2HBackgroundTransfers;
+#endif
+
     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
     std::vector<Mat*> inputBlobs;

@@ -1187,7 +1194,8 @@ struct Net::Impl : public detail::NetImplBase
         context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
         context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);

-        cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
+        auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+        cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
     }
 #endif
 }

@@ -1215,8 +1223,10 @@ struct Net::Impl : public detail::NetImplBase
 #ifdef HAVE_CUDA
     struct CudaInfo_t
     {
-        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
+        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt, cuda4dnn::csl::Stream d2h_stream_)
+            : context(std::move(ctxt)), d2h_stream(std::move(d2h_stream_)) { }
         cuda4dnn::csl::CSLContext context;
+        cuda4dnn::csl::Stream d2h_stream;
         cuda4dnn::csl::Workspace workspace;
     };

@@ -1290,7 +1300,7 @@ struct Net::Impl : public detail::NetImplBase
         if (preferableBackend == DNN_BACKEND_CUDA)
         {
             auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
-            cudaWrapper->setStream(cudaInfo->context.stream);
+            cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
         }
 #endif
         backendWrappers[data] = wrapper;

@@ -1630,7 +1640,7 @@ struct Net::Impl : public detail::NetImplBase
     else if (preferableBackend == DNN_BACKEND_VKCOM)
         initVkComBackend();
     else if (preferableBackend == DNN_BACKEND_CUDA)
-        initCUDABackend();
+        initCUDABackend(blobsToKeep_);
     else
         CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
 }

@@ -2360,7 +2370,7 @@ struct Net::Impl : public detail::NetImplBase
 #endif
 }

-void initCUDABackend() {
+void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
     CV_Assert(haveCUDA());

 #ifdef HAVE_CUDA

@@ -2386,6 +2396,15 @@ struct Net::Impl : public detail::NetImplBase
         auto cudaNode = node.dynamicCast<CUDABackendNode>();
         cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
     }
+
+    if (blobsToKeep_.size() > 1)
+    {
+        for (const auto& pin : blobsToKeep_)
+        {
+            LayerData& ld = layers[pin.lid];
+            ld.cudaD2HBackgroundTransfers.push_back(pin.oid);
+        }
+    }
 #endif
 }

@@ -3126,6 +3145,12 @@ struct Net::Impl : public detail::NetImplBase
     CV_Assert(!cudaNode.empty());

     cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
+
+    for (auto id : ld.cudaD2HBackgroundTransfers)
+    {
+        auto wrapper = ld.outputBlobsWrappers[id].dynamicCast<CUDABackendWrapper>();
+        wrapper->copyToHostInBackground();
+    }
 #endif
 }
 else if (preferableBackend == DNN_BACKEND_HALIDE)

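Taken together, the dnn.cpp changes record each requested output pin from blobsToKeep_ on its producing layer (cudaD2HBackgroundTransfers), then kick off the device-to-host copy on the dedicated d2h_stream right after that layer's forward pass, so the transfer overlaps with the compute of the remaining layers. Note the registration is skipped when only one blob is requested (blobsToKeep_.size() > 1): with a single final output there is no later compute to overlap with. A hedged sketch of the overlap in raw CUDA; launch_layer_kernels, want_output, host_out, dev_out, and nbytes are hypothetical stand-ins, not OpenCV's API:

    // Overlap D2H transfers of requested outputs with the compute of later layers.
    void forward_all(int num_layers, cudaStream_t inference_stream, cudaStream_t d2h_stream,
                     cudaEvent_t* output_ready, bool* want_output,
                     float** host_out, float** dev_out, size_t* nbytes) {
        for (int k = 0; k < num_layers; ++k) {
            launch_layer_kernels(k, inference_stream); // layer k's compute (hypothetical helper)

            if (want_output[k]) { // i.e. this layer's pin is in blobsToKeep_
                // the copy must not start before layer k's kernels finish ...
                cudaEventRecord(output_ready[k], inference_stream);
                cudaStreamWaitEvent(d2h_stream, output_ready[k], 0);
                // ... but it runs concurrently with layers k+1, k+2, ... on the other stream
                // (host_out[k] must be pinned memory for the copy to be truly asynchronous)
                cudaMemcpyAsync(host_out[k], dev_out[k], nbytes[k],
                                cudaMemcpyDeviceToHost, d2h_stream);
            }
        }
        cudaStreamSynchronize(inference_stream);
        cudaStreamSynchronize(d2h_stream); // all requested outputs are now on the host
    }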
modules/dnn/src/op_cuda.hpp (60 additions, 2 deletions)

@@ -7,6 +7,7 @@

 #ifdef HAVE_CUDA
 #include "cuda4dnn/csl/stream.hpp"
+#include "cuda4dnn/csl/event.hpp"
 #include "cuda4dnn/csl/cublas.hpp"
 #include "cuda4dnn/csl/cudnn.hpp"
 #include "cuda4dnn/csl/tensor.hpp"

@@ -206,6 +207,7 @@ namespace cv { namespace dnn {
     virtual ~CUDABackendWrapper() { }

     void copyToHost() override = 0;
+    virtual void copyToHostInBackground() = 0;
     void setHostDirty() override = 0;

     virtual void copyToDevice() = 0;

@@ -215,7 +217,7 @@ namespace cv { namespace dnn {
     virtual std::size_t getRank() const noexcept = 0;

     /** @note setting the stream updates the stream for all wrappers which use the same tensor */
-    virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+    virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream h2d_stream) noexcept = 0;

     virtual void update(const MatShape& shape, std::size_t offset) = 0;
 };

@@ -240,6 +242,36 @@ namespace cv { namespace dnn {
         cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
     }

+    template <class U>
+    void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event);
+
+    template <> inline
+    void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+        if (device_temp.size() < view.size())
+            device_temp.reset(view.size());
+        auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());
+
+        /* The conversion kernel could be executed in the background stream for better
+         * performance. We do it in the inference stream to prevent an unexplained performance
+         * regression on RTX 2080 Ti. Executing the conversion kernel in the background stream causes
+         * everything to slow down (even operations that appear before the background transfer).
+         *
+         * TODO: identify the cause and move the conversion kernel to the background stream
+         */
+        cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
+
+        d2h_event.record(stream); // mark position in inference stream
+        cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start transfer until data is available
+        cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream);
+    }
+
+    template <> inline
+    void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+        d2h_event.record(stream);
+        cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event);
+        cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream);
+    }
+
     template <class U>
     void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);

@@ -349,6 +381,28 @@ namespace cv { namespace dnn {

             cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
             shared_block->stream.synchronize();
+        } else if (shared_block->d2h_event && shared_block->d2h_event.busy()) {
+            /* wait for the background copy to finish */
+            shared_block->d2h_event.synchronize();
+        }
+    }
+
+    void copyToHostInBackground() override {
+        CV_Assert(shared_block->d2h_stream);
+        if (shared_block->device_dirty) {
+            shared_block->host_dirty = false;
+            shared_block->device_dirty = false;
+
+            auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+
+            auto& mat = shared_block->host;
+            CV_Assert(mat.isContinuous());
+            CV_Assert(mat.type() == CV_32F);
+
+            if (!shared_block->d2h_event)
+                shared_block->d2h_event = cuda4dnn::csl::Event(true);
+            cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event);
+            shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check status later
         }
     }

@@ -383,8 +437,9 @@ namespace cv { namespace dnn {

     std::size_t getRank() const noexcept override { return shape.size(); }

-    void setStream(cuda4dnn::csl::Stream stream) noexcept override {
+    void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override {
         shared_block->stream = std::move(stream);
+        shared_block->d2h_stream = std::move(d2h_stream);
     }

     void update(const MatShape& shape_, std::size_t offset_) override {

@@ -452,6 +507,9 @@ namespace cv { namespace dnn {
     cuda4dnn::csl::ManagedPtr<T> device;
     cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */
     cuda4dnn::csl::Stream stream;
+
+    cuda4dnn::csl::Event d2h_event;
+    cuda4dnn::csl::Stream d2h_stream;
 };

 std::shared_ptr<shared_block_type> shared_block;

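The op_cuda.hpp half is the synchronization core, and the event is deliberately recorded twice. The first record, on the inference stream, orders the copy after the output data (and, for fp16, after the fp16-to-fp32 conversion) is ready; after the memcpy is enqueued, copyToHostInBackground re-records the same event on the d2h stream so that its status then tracks the copy itself, which is what copyToHost later checks via busy()/synchronize(). A minimal, runnable sketch of this double-record pattern in raw CUDA (names are illustrative; cudaEventQuery returning cudaErrorNotReady is presumably what csl::Event::busy() wraps):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t n = 1 << 20;
        float *device_ptr, *host_ptr;
        cudaMalloc(&device_ptr, n * sizeof(float));
        cudaMallocHost(&host_ptr, n * sizeof(float)); // pinned memory: required for a truly async D2H copy

        cudaStream_t inference_stream, d2h_stream;
        cudaStreamCreate(&inference_stream);
        cudaStreamCreate(&d2h_stream);
        cudaEvent_t d2h_event;
        cudaEventCreateWithFlags(&d2h_event, cudaEventDisableTiming);

        // ... kernels producing device_ptr would be enqueued on inference_stream here ...

        cudaEventRecord(d2h_event, inference_stream);   // 1) the data is ready at this point
        cudaStreamWaitEvent(d2h_stream, d2h_event, 0);  //    the copy must not start earlier
        cudaMemcpyAsync(host_ptr, device_ptr, n * sizeof(float),
                        cudaMemcpyDeviceToHost, d2h_stream);
        cudaEventRecord(d2h_event, d2h_stream);         // 2) re-record so the event now tracks the copy

        // Later, a copyToHost() equivalent can poll the same event and block only if needed:
        if (cudaEventQuery(d2h_event) == cudaErrorNotReady)
            cudaEventSynchronize(d2h_event);            // wait for the background copy to finish

        printf("background copy complete\n");
        cudaEventDestroy(d2h_event);
        cudaStreamDestroy(inference_stream);
        cudaStreamDestroy(d2h_stream);
        cudaFree(device_ptr);
        cudaFreeHost(host_ptr);
        return 0;
    }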