@@ -34,9 +34,9 @@ namespace plugin {
// Threads per block used by every kernel launch in this file.
static constexpr int kNumCUDAThreads = 512;
// Cap on the launch grid size; kernels use grid-stride loops, so work beyond
// kNumMaximumNumBlocks * kNumCUDAThreads threads is still covered.
static constexpr int kNumMaximumNumBlocks = 4096;

// Returns the grid size for N work items: ceil(N / kNumCUDAThreads) clamped to
// kNumMaximumNumBlocks. The ceil-divide is performed in 64-bit arithmetic so
// that (N + kNumCUDAThreads - 1) cannot overflow for N close to INT_MAX; the
// clamped result always fits in an int.
static inline int NumBlocks(const int N) {
  const int64_t n = static_cast<int64_t>(N);
  return static_cast<int>(
      std::min<int64_t>((n + kNumCUDAThreads - 1) / kNumCUDAThreads,
                        static_cast<int64_t>(kNumMaximumNumBlocks)));
}
4141
4242static inline int ConvOutputSize (
@@ -367,66 +367,66 @@ __device__ half DmcnIm2colBilinear<half>(const half* bottom_data,
367367
// Forward declaration of the im2col kernel for modulated deformable
// convolution (explicit specializations for float and half follow below).
// nthreads is the total element count walked by the kernel's grid-stride
// loop; the remaining scalar parameters describe the input image geometry
// (height/width), filter (kernel_h/w, pad_h/w, stride_h/w, dilation_h/w),
// grouping (channel_per_deformable_group, deformable_group), batching
// (batch_size, num_channels), and the output-column buffer extents
// (height_col, width_col). data_offset / data_mask hold the per-position
// learned offsets and modulation masks; results are written to data_col.
template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
    const int nthreads,
    const T* data_im,
    const T* data_offset,
    const T* data_mask,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int pad_h,
    const int pad_w,
    const int stride_h,
    const int stride_w,
    const int dilation_h,
    const int dilation_w,
    const int channel_per_deformable_group,
    const int batch_size,
    const int num_channels,
    const int deformable_group,
    const int height_col,
    const int width_col,
    T* data_col);
391391
392392template <>
393393__global__ void ModulatedDeformableIm2colGpuKernel<float >(
394- const int64_t nthreads,
394+ const int nthreads,
395395 const float * data_im,
396396 const float * data_offset,
397397 const float * data_mask,
398- const int64_t height,
399- const int64_t width,
400- const int64_t kernel_h,
401- const int64_t kernel_w,
402- const int64_t pad_h,
403- const int64_t pad_w,
404- const int64_t stride_h,
405- const int64_t stride_w,
406- const int64_t dilation_h,
407- const int64_t dilation_w,
408- const int64_t channel_per_deformable_group,
409- const int64_t batch_size,
410- const int64_t num_channels,
411- const int64_t deformable_group,
412- const int64_t height_col,
413- const int64_t width_col,
398+ const int height,
399+ const int width,
400+ const int kernel_h,
401+ const int kernel_w,
402+ const int pad_h,
403+ const int pad_w,
404+ const int stride_h,
405+ const int stride_w,
406+ const int dilation_h,
407+ const int dilation_w,
408+ const int channel_per_deformable_group,
409+ const int batch_size,
410+ const int num_channels,
411+ const int deformable_group,
412+ const int height_col,
413+ const int width_col,
414414 float * data_col) {
415- int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
416- int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
415+ int index = blockIdx .x * blockDim .x + threadIdx .x ;
416+ int offset = blockDim .x * gridDim .x ;
417417
418418 float minus_one = -1 .0f , height_t = height, width_t = width;
419- for (int64_t i = index; i < nthreads; i += offset) {
420- const int64_t w_col = i % width_col;
421- const int64_t h_col = (i / width_col) % height_col;
422- const int64_t b_col = (i / width_col) / height_col % batch_size;
423- const int64_t c_im = (i / width_col / height_col) / batch_size;
424- const int64_t c_col = c_im * kernel_h * kernel_w;
419+ for (size_t i = index; i < nthreads; i += offset) {
420+ const int w_col = i % width_col;
421+ const int h_col = (i / width_col) % height_col;
422+ const int b_col = (i / width_col) / height_col % batch_size;
423+ const int c_im = (i / width_col / height_col) / batch_size;
424+ const int c_col = c_im * kernel_h * kernel_w;
425425
426- const int64_t deformable_group_index = c_im / channel_per_deformable_group;
426+ const int deformable_group_index = c_im / channel_per_deformable_group;
427427
428- const int64_t h_in = h_col * stride_h - pad_h;
429- const int64_t w_in = w_col * stride_w - pad_w;
428+ const int h_in = h_col * stride_h - pad_h;
429+ const int w_in = w_col * stride_w - pad_w;
430430
431431 float * data_col_ptr =
432432 data_col +
@@ -440,14 +440,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
440440 data_mask + (b_col * deformable_group + deformable_group_index) *
441441 kernel_h * kernel_w * height_col * width_col;
442442
443- for (int64_t i = 0 ; i < kernel_h; ++i) {
444- for (int64_t j = 0 ; j < kernel_w; ++j) {
445- const int64_t data_offset_h_ptr =
443+ for (int i = 0 ; i < kernel_h; ++i) {
444+ for (int j = 0 ; j < kernel_w; ++j) {
445+ const int data_offset_h_ptr =
446446 ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
447- const int64_t data_offset_w_ptr =
447+ const int data_offset_w_ptr =
448448 ((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
449449 w_col;
450- const int64_t data_mask_hw_ptr =
450+ const int data_mask_hw_ptr =
451451 ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
452452
453453 const float offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -471,45 +471,43 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
471471
472472template <>
473473__global__ void ModulatedDeformableIm2colGpuKernel<half>(
474- const int64_t nthreads,
474+ const int nthreads,
475475 const half* data_im,
476476 const half* data_offset,
477477 const half* data_mask,
478- const int64_t height,
479- const int64_t width,
480- const int64_t kernel_h,
481- const int64_t kernel_w,
482- const int64_t pad_h,
483- const int64_t pad_w,
484- const int64_t stride_h,
485- const int64_t stride_w,
486- const int64_t dilation_h,
487- const int64_t dilation_w,
488- const int64_t channel_per_deformable_group,
489- const int64_t batch_size,
490- const int64_t num_channels,
491- const int64_t deformable_group,
492- const int64_t height_col,
493- const int64_t width_col,
478+ const int height,
479+ const int width,
480+ const int kernel_h,
481+ const int kernel_w,
482+ const int pad_h,
483+ const int pad_w,
484+ const int stride_h,
485+ const int stride_w,
486+ const int dilation_h,
487+ const int dilation_w,
488+ const int channel_per_deformable_group,
489+ const int batch_size,
490+ const int num_channels,
491+ const int deformable_group,
492+ const int height_col,
493+ const int width_col,
494494 half* data_col) {
495495#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
496- int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
497- int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
496+ int index = blockIdx .x * blockDim .x + threadIdx .x ;
497+ int offset = blockDim .x * gridDim .x ;
498498
499- half minus_one = -1 .0f ,
500- height_t = static_cast <half>(static_cast <float >(height)),
501- width_t = static_cast <half>(static_cast <float >(width));
499+ half minus_one = -1 .0f , height_t = height, width_t = width;
502500 for (size_t i = index; i < nthreads; i += offset) {
503- const int64_t w_col = i % width_col;
504- const int64_t h_col = (i / width_col) % height_col;
505- const int64_t b_col = (i / width_col) / height_col % batch_size;
506- const int64_t c_im = (i / width_col / height_col) / batch_size;
507- const int64_t c_col = c_im * kernel_h * kernel_w;
501+ const int w_col = i % width_col;
502+ const int h_col = (i / width_col) % height_col;
503+ const int b_col = (i / width_col) / height_col % batch_size;
504+ const int c_im = (i / width_col / height_col) / batch_size;
505+ const int c_col = c_im * kernel_h * kernel_w;
508506
509- const int64_t deformable_group_index = c_im / channel_per_deformable_group;
507+ const int deformable_group_index = c_im / channel_per_deformable_group;
510508
511- const int64_t h_in = h_col * stride_h - pad_h;
512- const int64_t w_in = w_col * stride_w - pad_w;
509+ const int h_in = h_col * stride_h - pad_h;
510+ const int w_in = w_col * stride_w - pad_w;
513511
514512 half* data_col_ptr =
515513 data_col +
@@ -523,22 +521,21 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
523521 data_mask + (b_col * deformable_group + deformable_group_index) *
524522 kernel_h * kernel_w * height_col * width_col;
525523
526- for (int64_t i = 0 ; i < kernel_h; ++i) {
527- for (int64_t j = 0 ; j < kernel_w; ++j) {
528- const int64_t data_offset_h_ptr =
524+ for (int i = 0 ; i < kernel_h; ++i) {
525+ for (int j = 0 ; j < kernel_w; ++j) {
526+ const int data_offset_h_ptr =
529527 ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
530- const int64_t data_offset_w_ptr =
528+ const int data_offset_w_ptr =
531529 ((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
532530 w_col;
533- const int64_t data_mask_hw_ptr =
531+ const int data_mask_hw_ptr =
534532 ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
535533
536534 const half offset_h = data_offset_ptr[data_offset_h_ptr];
537535 const half offset_w = data_offset_ptr[data_offset_w_ptr];
538536 const half mask = data_mask_ptr[data_mask_hw_ptr];
539537 half val = 0 ;
540- half h_im_t = static_cast <float >(h_in) + i * dilation_h,
541- w_im_t = static_cast <float >(w_in) + j * dilation_w;
538+ half h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
542539 const half h_im = h_im_t + offset_h;
543540 const half w_im = w_im_t + offset_w;
544541 if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
0 commit comments