Merge pull request opencv#10906 from nglee:dev_cudaFastMultiStreamSafety

nglee · alalek · commit 12ce6b5c1557 · 2018-02-20T20:55:57.000+03:00
cuda_fast : multi stream safety (opencv#10906) * CUDA_Features2D/FAST Asynchronous test * cuda_fast : multi stream safety * Use parallel_for instead of OpenMP
diff --git a/modules/cudafeatures2d/src/cuda/fast.cu b/modules/cudafeatures2d/src/cuda/fast.cu
@@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace fast
     {
-        __device__ unsigned int g_counter = 0;
-
         ///////////////////////////////////////////////////////////////////////////
         // calcKeypoints
 
@@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device
         }
 
         template <bool calcScore, class Mask>
-        __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
+        __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter)
         {
             #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 
@@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device
                 {
                     if (calcScore) score(i, j) = cornerScore(C, v, threshold);
 
-                    const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+                    const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
 
                     if (ind < maxKeypoints)
                         kpLoc[ind] = make_short2(j, i);
@@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device
             #endif
         }
 
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
             dim3 block(32, 8);
 
             dim3 grid;
             grid.x = divUp(img.cols - 6, block.x);
             grid.y = divUp(img.rows - 6, block.y);
 
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
 
             if (score.data)
             {
                 if (mask.data)
-                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
                 else
-                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
             }
             else
             {
                 if (mask.data)
-                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
                 else
-                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
             }
 
             cudaSafeCall( cudaGetLastError() );
 
             unsigned int count;
-            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
 
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
@@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device
         ///////////////////////////////////////////////////////////////////////////
         // nonmaxSuppression
 
-        __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
+        __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter)
         {
             #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 
@@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device
 
                 if (ismax)
                 {
-                    const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+                    const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
 
                     locFinal[ind] = loc;
                     responseFinal[ind] = static_cast<float>(score);
@@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device
             #endif
         }
 
-        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
+        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
             dim3 block(256);
 
             dim3 grid;
             grid.x = divUp(count, block.x);
 
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
 
-            nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
+            nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response, d_counter);
             cudaSafeCall( cudaGetLastError() );
 
             unsigned int new_count;
-            cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
 
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
diff --git a/modules/cudafeatures2d/src/fast.cpp b/modules/cudafeatures2d/src/fast.cpp
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace fast
     {
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
-        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream);
+        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream);
     }
 }}}
 
@@ -88,6 +88,8 @@ namespace
         int threshold_;
         bool nonmaxSuppression_;
         int max_npoints_;
+
+        unsigned int* d_counter;
     };
 
     FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
@@ -114,6 +116,8 @@ namespace
     {
         using namespace cv::cuda::device::fast;
 
+        cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );
+
         const GpuMat img = _image.getGpuMat();
         const GpuMat mask = _mask.getGpuMat();
 
@@ -131,7 +135,7 @@ namespace
             score.setTo(Scalar::all(0), stream);
         }
 
-        int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
+        int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream));
         count = std::min(count, max_npoints_);
 
         if (count == 0)
@@ -145,7 +149,7 @@ namespace
 
         if (nonmaxSuppression_)
         {
-            count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
+            count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream));
             if (count == 0)
             {
                 keypoints.release();
@@ -161,6 +165,8 @@ namespace
             kpLoc.colRange(0, count).copyTo(locRow, stream);
             keypoints.row(1).setTo(Scalar::all(0), stream);
         }
+
+        cudaSafeCall( cudaFree(d_counter) );
     }
 
     void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp
@@ -44,6 +44,8 @@
 
 #ifdef HAVE_CUDA
 
+#include <cuda_runtime_api.h>
+
 namespace opencv_test { namespace {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -80,15 +82,7 @@ CUDA_TEST_P(FAST, Accuracy)
 
     if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
     {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            fast->detect(loadMat(image), keypoints);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
+        throw SkipTestException("CUDA device doesn't support global atomics");
     }
     else
     {
@@ -102,6 +96,62 @@ CUDA_TEST_P(FAST, Accuracy)
     }
 }
 
+class FastAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    FastAsyncParallelLoopBody(cv::cuda::HostMem& src, cv::cuda::GpuMat* d_kpts, cv::Ptr<cv::cuda::FastFeatureDetector>* d_fast)
+        : src_(src), kpts_(d_kpts), fast_(d_fast) {}
+    ~FastAsyncParallelLoopBody() {};
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::cuda::GpuMat d_src_(src_.rows, src_.cols, CV_8UC1);
+            d_src_.upload(src_);
+            fast_[i]->detectAsync(d_src_, kpts_[i], noArray(), stream);
+        }
+    }
+protected:
+    cv::cuda::HostMem src_;
+    cv::cuda::GpuMat* kpts_;
+    cv::Ptr<cv::cuda::FastFeatureDetector>* fast_;
+};
+
+CUDA_TEST_P(FAST, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
+    {
+        throw SkipTestException("CUDA device doesn't support global atomics");
+    }
+    else
+    {
+        cv::Mat image_ = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(image_.empty());
+
+        cv::cuda::HostMem image(image_);
+
+        cv::cuda::GpuMat d_keypoints[2];
+        cv::Ptr<cv::cuda::FastFeatureDetector> d_fast[2];
+
+        d_fast[0] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
+        d_fast[1] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
+
+        cv::parallel_for_(cv::Range(0, 2), FastAsyncParallelLoopBody(image, d_keypoints, d_fast));
+
+        cudaDeviceSynchronize();
+
+        std::vector<cv::KeyPoint> keypoints[2];
+        d_fast[0]->convert(d_keypoints[0], keypoints[0]);
+        d_fast[1]->convert(d_keypoints[1], keypoints[1]);
+
+        std::vector<cv::KeyPoint> keypoints_gold;
+        cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
+
+        ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[0]);
+        ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[1]);
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine(
     ALL_DEVICES,
     testing::Values(FAST_Threshold(25), FAST_Threshold(50)),

Original file line number	Diff line number	Diff line change
`@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device`
`55`	`55`	`{`
`56`	`56`	`namespace fast`
`57`	`57`	`{`
`58`		`- int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);`
`59`		`- int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);`
	`58`	`+ int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream);`
	`59`	`+ int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream);`
`60`	`60`	`}`
`61`	`61`	`}}}`
`62`	`62`
`@@ -88,6 +88,8 @@ namespace`
`88`	`88`	`int threshold_;`
`89`	`89`	`bool nonmaxSuppression_;`
`90`	`90`	`int max_npoints_;`
	`91`	`+`
	`92`	`+ unsigned int* d_counter;`
`91`	`93`	`};`
`92`	`94`
`93`	`95`	`FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :`
`@@ -114,6 +116,8 @@ namespace`
`114`	`116`	`{`
`115`	`117`	`using namespace cv::cuda::device::fast;`
`116`	`118`
	`119`	`+ cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );`
	`120`	`+`
`117`	`121`	`const GpuMat img = _image.getGpuMat();`
`118`	`122`	`const GpuMat mask = _mask.getGpuMat();`
`119`	`123`
`@@ -131,7 +135,7 @@ namespace`
`131`	`135`	`score.setTo(Scalar::all(0), stream);`
`132`	`136`	`}`
`133`	`137`
`134`		`- int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));`
	`138`	`+ int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream));`
`135`	`139`	`count = std::min(count, max_npoints_);`
`136`	`140`
`137`	`141`	`if (count == 0)`
`@@ -145,7 +149,7 @@ namespace`
`145`	`149`
`146`	`150`	`if (nonmaxSuppression_)`
`147`	`151`	`{`
`148`		`- count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));`
	`152`	`+ count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream));`
`149`	`153`	`if (count == 0)`
`150`	`154`	`{`
`151`	`155`	`keypoints.release();`
`@@ -161,6 +165,8 @@ namespace`
`161`	`165`	`kpLoc.colRange(0, count).copyTo(locRow, stream);`
`162`	`166`	`keypoints.row(1).setTo(Scalar::all(0), stream);`
`163`	`167`	`}`
	`168`	`+`
	`169`	`+ cudaSafeCall( cudaFree(d_counter) );`
`164`	`170`	`}`
`165`	`171`
`166`	`172`	`void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)`