#ifdef HAVE_CUDA
#include "cuda4dnn/csl/stream.hpp"
+#include "cuda4dnn/csl/event.hpp"
#include "cuda4dnn/csl/cublas.hpp"
#include "cuda4dnn/csl/cudnn.hpp"
#include "cuda4dnn/csl/tensor.hpp"
@@ -206,6 +207,7 @@ namespace cv { namespace dnn {
        virtual ~CUDABackendWrapper() { }

        void copyToHost() override = 0;
+       virtual void copyToHostInBackground() = 0;
        void setHostDirty() override = 0;

        virtual void copyToDevice() = 0;
@@ -215,7 +217,7 @@ namespace cv { namespace dnn {
        virtual std::size_t getRank() const noexcept = 0;

        /** @note setting the stream updates the stream for all wrappers which use the same tensor */
-       virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+       virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream h2d_stream) noexcept = 0;

        virtual void update(const MatShape& shape, std::size_t offset) = 0;
    };
@@ -240,6 +242,36 @@ namespace cv { namespace dnn {
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
        }

+       template <class U>
+       void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event);
+
+       template <> inline
+       void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+           if (device_temp.size() < view.size())
+               device_temp.reset(view.size());
+           auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());
+
+           /* The conversion kernel could be executed in the background stream for better
+            * performance. We do it in the inference stream to prevent an unexplained performance
+            * regression on RTX 2080 Ti. Executing the conversion kernel in the background stream causes
+            * everything to slow down (even operations that appear before the background transfer).
+            *
+            * TODO: identify the cause and move the conversion kernel to the background stream
+            */
+           cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
+
+           d2h_event.record(stream); // mark position in the inference stream
+           cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start the transfer until the data is available
+           cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream);
+       }
+
+       template <> inline
+       void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+           d2h_event.record(stream);
+           cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event);
+           cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream);
+       }
+
        template <class U>
        void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);
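Note for reviewers: the ordering that convert_D2H_background relies on reduces to a standard three-step CUDA runtime pattern: record an event on the stream that produces the data, make the transfer stream wait on that event, then issue the asynchronous copy on the transfer stream. A minimal standalone sketch of that pattern follows (plain runtime API with illustrative names; this is not the csl implementation itself):

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: device-to-host copy ordered after work on the compute stream,
// but issued on a separate transfer stream so it can overlap with later kernels.
void d2h_in_background(const float* d_result, float* h_pinned, std::size_t count,
                       cudaStream_t compute, cudaStream_t transfer, cudaEvent_t ready)
{
    // 1. Mark the point in the compute stream at which d_result is fully written.
    cudaEventRecord(ready, compute);
    // 2. The transfer stream must not start the copy before that point.
    cudaStreamWaitEvent(transfer, ready, 0);
    // 3. Asynchronous copy on the transfer stream.
    cudaMemcpyAsync(h_pinned, d_result, count * sizeof(float),
                    cudaMemcpyDeviceToHost, transfer);
    // 4. Record again on the transfer stream so the caller can later poll or
    //    wait for completion, mirroring what copyToHostInBackground does with d2h_event.
    cudaEventRecord(ready, transfer);
}

A general caveat that applies to any variant of this pattern: the copy only overlaps with compute when the host destination is page-locked; with pageable host memory the runtime stages the transfer and it becomes effectively synchronous.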
@@ -349,6 +381,28 @@ namespace cv { namespace dnn {
                cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
                shared_block->stream.synchronize();
+           } else if (shared_block->d2h_event && shared_block->d2h_event.busy()) {
+               /* wait for the background copy to finish */
+               shared_block->d2h_event.synchronize();
+           }
+       }
+
+       void copyToHostInBackground() override {
+           CV_Assert(shared_block->d2h_stream);
+           if (shared_block->device_dirty) {
+               shared_block->host_dirty = false;
+               shared_block->device_dirty = false;
+
+               auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+
+               auto& mat = shared_block->host;
+               CV_Assert(mat.isContinuous());
+               CV_Assert(mat.type() == CV_32F);
+
+               if (!shared_block->d2h_event)
+                   shared_block->d2h_event = cuda4dnn::csl::Event(true);
+               cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event);
+               shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check status later
            }
        }
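The busy()/synchronize() pair used by copyToHost above maps onto a query-then-block idiom on the completion event. A rough standalone equivalent with the raw runtime API, assuming Event::busy() wraps something like cudaEventQuery (illustrative helpers, not the csl API):

#include <cuda_runtime.h>

// Returns true while work recorded on the event has not yet completed,
// roughly what an Event::busy()-style helper would check.
static bool event_busy(cudaEvent_t e)
{
    cudaError_t status = cudaEventQuery(e);
    if (status == cudaErrorNotReady)
        return true;           // background transfer still in flight
    // cudaSuccess means the recorded work finished; other codes are real errors.
    return false;
}

// Blocking wait used when the host actually needs the data now,
// mirroring the d2h_event.synchronize() call in copyToHost().
static void event_wait(cudaEvent_t e)
{
    cudaEventSynchronize(e);   // blocks the calling host thread
}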
@@ -383,8 +437,9 @@ namespace cv { namespace dnn {
        std::size_t getRank() const noexcept override { return shape.size(); }

-       void setStream(cuda4dnn::csl::Stream stream) noexcept override {
+       void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override {
            shared_block->stream = std::move(stream);
+           shared_block->d2h_stream = std::move(d2h_stream);
        }

        void update(const MatShape& shape_, std::size_t offset_) override {
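How the caller obtains the second stream is outside this hunk; the sketch below shows one conventional way to create a dedicated transfer stream (hypothetical helper, not part of cuda4dnn::csl). A non-blocking stream avoids implicit synchronization with the legacy default stream, which is what lets the D2H copy overlap with kernels on the inference stream:

#include <cuda_runtime.h>

// Illustrative setup for a dedicated device-to-host transfer stream.
cudaStream_t make_transfer_stream()
{
    cudaStream_t s = nullptr;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    return s;
}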
@@ -452,6 +507,9 @@ namespace cv { namespace dnn {
            cuda4dnn::csl::ManagedPtr<T> device;
            cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */
            cuda4dnn::csl::Stream stream;
+
+           cuda4dnn::csl::Event d2h_event;
+           cuda4dnn::csl::Stream d2h_stream;
        };

        std::shared_ptr<shared_block_type> shared_block;