@@ -84,10 +84,10 @@ __global__ static void gather_keep_from_mask(bool *keep,
   const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
   const int thread_id = threadIdx.x;
 
-  // mark the bboxes which have been removed.
+  // Mark the bboxes which have been removed.
   extern __shared__ unsigned long long removed[];
 
-  // initialize removed.
+  // Initialize removed.
   for (int i = thread_id; i < col_blocks; i += blockDim.x) {
     removed[i] = 0;
   }
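For orientation, `removed` holds one 64-bit word per block of `threadsPerBlock` boxes (`threadsPerBlock` is the bit width of `unsigned long long`, i.e. 64), so bit `b` of word `j` tracks box `j * 64 + b`. A minimal host-side sketch of that indexing, with hypothetical helper names that are not part of the kernel:

```cpp
#include <cstdint>

// Hypothetical helpers mirroring the bit layout of `removed` (and of
// each row of `dev_mask`): one bit per box, 64 boxes per word.
constexpr int kBitsPerWord = 64;  // == threadsPerBlock in the kernel

inline int word_index(int box) { return box / kBitsPerWord; }
inline uint64_t box_bit(int box) { return 1ULL << (box % kBitsPerWord); }

inline bool is_removed(const uint64_t* removed, int box) {
  return (removed[word_index(box)] & box_bit(box)) != 0;
}
inline void mark_removed(uint64_t* removed, int box) {
  removed[word_index(box)] |= box_bit(box);
}
```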
@@ -101,14 +101,13 @@ __global__ static void gather_keep_from_mask(bool *keep,
     for (int inblock = 0; inblock < threadsPerBlock; inblock++) {
       const int i = i_offset + inblock;
       if (i >= n_boxes) break;
-      // select a candidate, check if it should kept.
+      // Select a candidate, check if it should be kept.
       if (!(removed_val & (1ULL << inblock))) {
         if (thread_id == 0) {
-          // mark the output.
           keep[i] = true;
         }
         auto p = dev_mask + i * col_blocks;
-        // remove all bboxes which overlap the candidate.
+        // Remove all bboxes which overlap the candidate.
         for (int j = thread_id; j < col_blocks; j += blockDim.x) {
           if (j >= nblock) removed[j] |= p[j];
         }
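The loop above is the parallel form of a straightforward sequential pass over the precomputed overlap mask. A CPU reference sketch of the same logic, assuming (as for the mask produced earlier in this file) that bit `b` of `mask[i * col_blocks + j]` is set when box `i` suppresses box `j * 64 + b`:

```cpp
#include <cstdint>
#include <vector>

// Sequential reference for the gather pass: visit boxes in score order,
// keep a box unless an earlier kept box suppressed it, then fold its
// overlap row into the running `removed` bitmask.
std::vector<bool> gather_keep_cpu(const uint64_t* mask,
                                  int n_boxes, int col_blocks) {
  std::vector<uint64_t> removed(col_blocks, 0);
  std::vector<bool> keep(n_boxes, false);
  for (int i = 0; i < n_boxes; i++) {
    if (!(removed[i / 64] & (1ULL << (i % 64)))) {
      keep[i] = true;
      const uint64_t* row = mask + static_cast<int64_t>(i) * col_blocks;
      for (int j = 0; j < col_blocks; j++) {
        removed[j] |= row[j];
      }
    }
  }
  return keep;
}
```

The kernel's `j >= nblock` guard skips words whose boxes were already decided in earlier iterations; ORing into them, as this reference does, would not change the output, only waste work.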
@@ -181,10 +180,10 @@ at::Tensor nms_kernel(
   );
 
   // Unwrap the mask to fill keep with proper values
-  // Keeping this unwrap on cuda instead of applying iterative for loops on cpu
+  // Keeping the unwrap on device instead of applying iterative for loops on cpu
   // prevents the device -> cpu -> device transfer that could be bottleneck for
   // large number of boxes.
-  // See https://github.com/pytorch/vision/issues/8713 for more details
+  // See https://github.com/pytorch/vision/issues/8713 for more details.
   gather_keep_from_mask<<<1, min(col_blocks, threadsPerBlock),
                           col_blocks * sizeof(unsigned long long), stream>>>(
       keep.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
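For contrast, here is a hypothetical sketch (not code from the repository) of the device -> cpu -> device round trip that this single-block launch avoids: copy the mask down, unwrap it sequentially on the host, and copy `keep` back, paying two transfers that grow with the number of boxes.

```cpp
#include <ATen/ATen.h>
#include <vector>

// Hypothetical host-side unwrap that the on-device kernel replaces.
at::Tensor unwrap_mask_on_cpu(const at::Tensor& mask, int n_boxes) {
  const int col_blocks = (n_boxes + 63) / 64;
  at::Tensor mask_cpu = mask.to(at::kCPU);  // device -> cpu transfer
  auto* mask_ptr = (unsigned long long*)mask_cpu.data_ptr<int64_t>();
  std::vector<unsigned long long> removed(col_blocks, 0);
  at::Tensor keep_cpu = at::zeros({n_boxes}, at::kBool);
  bool* keep_ptr = keep_cpu.data_ptr<bool>();
  for (int i = 0; i < n_boxes; i++) {
    if (!(removed[i / 64] & (1ULL << (i % 64)))) {
      keep_ptr[i] = true;
      for (int j = 0; j < col_blocks; j++)
        removed[j] |= mask_ptr[static_cast<int64_t>(i) * col_blocks + j];
    }
  }
  return keep_cpu.to(mask.device());  // cpu -> device transfer
}
```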