Skip to content

Commit 12ce6b5

Browse files
ngleealalek
authored andcommitted
Merge pull request opencv#10906 from nglee:dev_cudaFastMultiStreamSafety
cuda_fast : multi stream safety (opencv#10906) * CUDA_Features2D/FAST Asynchronous test * cuda_fast : multi stream safety * Use parallel_for instead of OpenMP
1 parent c6e1e3a commit 12ce6b5

File tree

3 files changed

+84
-36
lines changed

3 files changed

+84
-36
lines changed

modules/cudafeatures2d/src/cuda/fast.cu

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device
4949
{
5050
namespace fast
5151
{
52-
__device__ unsigned int g_counter = 0;
53-
5452
///////////////////////////////////////////////////////////////////////////
5553
// calcKeypoints
5654

@@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device
218216
}
219217

220218
template <bool calcScore, class Mask>
221-
__global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
219+
__global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter)
222220
{
223221
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
224222

@@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device
269267
{
270268
if (calcScore) score(i, j) = cornerScore(C, v, threshold);
271269

272-
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
270+
const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
273271

274272
if (ind < maxKeypoints)
275273
kpLoc[ind] = make_short2(j, i);
@@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device
279277
#endif
280278
}
281279

282-
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
280+
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream)
283281
{
284-
void* counter_ptr;
285-
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
286-
287282
dim3 block(32, 8);
288283

289284
dim3 grid;
290285
grid.x = divUp(img.cols - 6, block.x);
291286
grid.y = divUp(img.rows - 6, block.y);
292287

293-
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
288+
cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
294289

295290
if (score.data)
296291
{
297292
if (mask.data)
298-
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
293+
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
299294
else
300-
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
295+
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
301296
}
302297
else
303298
{
304299
if (mask.data)
305-
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
300+
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
306301
else
307-
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
302+
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
308303
}
309304

310305
cudaSafeCall( cudaGetLastError() );
311306

312307
unsigned int count;
313-
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
308+
cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
314309

315310
cudaSafeCall( cudaStreamSynchronize(stream) );
316311

@@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device
320315
///////////////////////////////////////////////////////////////////////////
321316
// nonmaxSuppression
322317

323-
__global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
318+
__global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter)
324319
{
325320
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
326321

@@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device
346341

347342
if (ismax)
348343
{
349-
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
344+
const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
350345

351346
locFinal[ind] = loc;
352347
responseFinal[ind] = static_cast<float>(score);
@@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device
356351
#endif
357352
}
358353

359-
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
354+
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream)
360355
{
361-
void* counter_ptr;
362-
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
363-
364356
dim3 block(256);
365357

366358
dim3 grid;
367359
grid.x = divUp(count, block.x);
368360

369-
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
361+
cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
370362

371-
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
363+
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response, d_counter);
372364
cudaSafeCall( cudaGetLastError() );
373365

374366
unsigned int new_count;
375-
cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
367+
cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
376368

377369
cudaSafeCall( cudaStreamSynchronize(stream) );
378370

modules/cudafeatures2d/src/fast.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
5555
{
5656
namespace fast
5757
{
58-
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
59-
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
58+
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream);
59+
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream);
6060
}
6161
}}}
6262

@@ -88,6 +88,8 @@ namespace
8888
int threshold_;
8989
bool nonmaxSuppression_;
9090
int max_npoints_;
91+
92+
unsigned int* d_counter;
9193
};
9294

9395
FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
@@ -114,6 +116,8 @@ namespace
114116
{
115117
using namespace cv::cuda::device::fast;
116118

119+
cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );
120+
117121
const GpuMat img = _image.getGpuMat();
118122
const GpuMat mask = _mask.getGpuMat();
119123

@@ -131,7 +135,7 @@ namespace
131135
score.setTo(Scalar::all(0), stream);
132136
}
133137

134-
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
138+
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream));
135139
count = std::min(count, max_npoints_);
136140

137141
if (count == 0)
@@ -145,7 +149,7 @@ namespace
145149

146150
if (nonmaxSuppression_)
147151
{
148-
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
152+
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream));
149153
if (count == 0)
150154
{
151155
keypoints.release();
@@ -161,6 +165,8 @@ namespace
161165
kpLoc.colRange(0, count).copyTo(locRow, stream);
162166
keypoints.row(1).setTo(Scalar::all(0), stream);
163167
}
168+
169+
cudaSafeCall( cudaFree(d_counter) );
164170
}
165171

166172
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)

modules/cudafeatures2d/test/test_features2d.cpp

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444

4545
#ifdef HAVE_CUDA
4646

47+
#include <cuda_runtime_api.h>
48+
4749
namespace opencv_test { namespace {
4850

4951
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -80,15 +82,7 @@ CUDA_TEST_P(FAST, Accuracy)
8082

8183
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
8284
{
83-
try
84-
{
85-
std::vector<cv::KeyPoint> keypoints;
86-
fast->detect(loadMat(image), keypoints);
87-
}
88-
catch (const cv::Exception& e)
89-
{
90-
ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
91-
}
85+
throw SkipTestException("CUDA device doesn't support global atomics");
9286
}
9387
else
9488
{
@@ -102,6 +96,62 @@ CUDA_TEST_P(FAST, Accuracy)
10296
}
10397
}
10498

99+
class FastAsyncParallelLoopBody : public cv::ParallelLoopBody
100+
{
101+
public:
102+
FastAsyncParallelLoopBody(cv::cuda::HostMem& src, cv::cuda::GpuMat* d_kpts, cv::Ptr<cv::cuda::FastFeatureDetector>* d_fast)
103+
: src_(src), kpts_(d_kpts), fast_(d_fast) {}
104+
~FastAsyncParallelLoopBody() {};
105+
void operator()(const cv::Range& r) const
106+
{
107+
for (int i = r.start; i < r.end; i++) {
108+
cv::cuda::Stream stream;
109+
cv::cuda::GpuMat d_src_(src_.rows, src_.cols, CV_8UC1);
110+
d_src_.upload(src_);
111+
fast_[i]->detectAsync(d_src_, kpts_[i], noArray(), stream);
112+
}
113+
}
114+
protected:
115+
cv::cuda::HostMem src_;
116+
cv::cuda::GpuMat* kpts_;
117+
cv::Ptr<cv::cuda::FastFeatureDetector>* fast_;
118+
};
119+
120+
CUDA_TEST_P(FAST, Async)
121+
{
122+
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
123+
{
124+
throw SkipTestException("CUDA device doesn't support global atomics");
125+
}
126+
else
127+
{
128+
cv::Mat image_ = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
129+
ASSERT_FALSE(image_.empty());
130+
131+
cv::cuda::HostMem image(image_);
132+
133+
cv::cuda::GpuMat d_keypoints[2];
134+
cv::Ptr<cv::cuda::FastFeatureDetector> d_fast[2];
135+
136+
d_fast[0] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
137+
d_fast[1] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
138+
139+
cv::parallel_for_(cv::Range(0, 2), FastAsyncParallelLoopBody(image, d_keypoints, d_fast));
140+
141+
cudaDeviceSynchronize();
142+
143+
std::vector<cv::KeyPoint> keypoints[2];
144+
d_fast[0]->convert(d_keypoints[0], keypoints[0]);
145+
d_fast[1]->convert(d_keypoints[1], keypoints[1]);
146+
147+
std::vector<cv::KeyPoint> keypoints_gold;
148+
cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
149+
150+
ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[0]);
151+
ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[1]);
152+
}
153+
}
154+
105155
INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine(
106156
ALL_DEVICES,
107157
testing::Values(FAST_Threshold(25), FAST_Threshold(50)),

0 commit comments

Comments
 (0)