@@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device
 {
     namespace fast
     {
-        __device__ unsigned int g_counter = 0;
-
         ///////////////////////////////////////////////////////////////////////////
         // calcKeypoints
 
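(Context: the __device__ unsigned int g_counter removed above is a single module-global counter shared by every invocation of the kernels below. The rest of this diff threads a caller-owned unsigned int* d_counter through both kernels and their host wrappers instead, so concurrent calls, e.g. on different CUDA streams, no longer contend for one symbol.)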
@@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device
         }
 
         template <bool calcScore, class Mask>
-        __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
+        __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter)
         {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 
@@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device
                 {
                     if (calcScore) score(i, j) = cornerScore(C, v, threshold);
 
-                    const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+                    const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
 
                     if (ind < maxKeypoints)
                         kpLoc[ind] = make_short2(j, i);
@@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device
         #endif
         }
 
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
             dim3 block(32, 8);
 
             dim3 grid;
             grid.x = divUp(img.cols - 6, block.x);
             grid.y = divUp(img.rows - 6, block.y);
 
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
 
             if (score.data)
             {
                 if (mask.data)
-                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
                 else
-                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
             }
             else
             {
                 if (mask.data)
-                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
                 else
-                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
             }
 
             cudaSafeCall( cudaGetLastError() );
 
             unsigned int count;
-            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
 
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
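With the global symbol gone, the caller now owns the counter allocation. Below is a minimal call-site sketch for the new calcKeypoints_gpu signature; the bare cudaMalloc and the d_kpLoc/d_score buffers are assumptions for illustration (the actual OpenCV caller may hold this buffer elsewhere, e.g. inside the detector object):

    // Hypothetical call site -- not part of this commit.
    unsigned int* d_counter = 0;
    cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );

    // The wrapper zeroes *d_counter via cudaMemsetAsync, launches the kernel,
    // copies the keypoint count back, and synchronizes the stream (see above).
    int count = cv::cuda::device::fast::calcKeypoints_gpu(
        img, mask, d_kpLoc, maxKeypoints, d_score, threshold, d_counter, stream);

    cudaSafeCall( cudaFree(d_counter) );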
@@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device
         ///////////////////////////////////////////////////////////////////////////
         // nonmaxSuppression
 
-        __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
+        __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter)
         {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 
@@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device
 
             if (ismax)
             {
-                const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+                const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
 
                 locFinal[ind] = loc;
                 responseFinal[ind] = static_cast<float>(score);
@@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device
         #endif
         }
 
-        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
+        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
             dim3 block(256);
 
             dim3 grid;
             grid.x = divUp(count, block.x);
 
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
 
-            nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
+            nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response, d_counter);
            cudaSafeCall( cudaGetLastError() );
 
             unsigned int new_count;
-            cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
 
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
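Since each wrapper resets the counter with cudaMemsetAsync before its launch, a single allocation can serve both passes back-to-back on the same stream. A sketch, reusing the hypothetical d_counter from above (d_loc and d_response are likewise assumed pre-allocated device buffers):

    // Detection pass, then non-max suppression, sharing one counter allocation.
    int count     = calcKeypoints_gpu(img, mask, d_kpLoc, maxKeypoints, d_score, threshold, d_counter, stream);
    int new_count = nonmaxSuppression_gpu(d_kpLoc, count, d_score, d_loc, d_response, d_counter, stream);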