@@ -31,6 +31,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
   index_t invalid_i = -1, invalid_idx = -1;
   int32_t invalid_b_t = -1;
   int64_t warning_inc = 0;
+  __shared__ int64_t block_warning_buffer[kMaxThreads];
+  const int linear_tid = threadIdx.z * (blockDim.y * blockDim.x) +
+      threadIdx.y * blockDim.x + threadIdx.x;
+  const int active_threads = blockDim.x * blockDim.y * blockDim.z;

   // Check the last element
   if (b_t_start == 0 && threadIdx.x == 0) {
@@ -142,9 +146,26 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     }
   } // for b_t

-  if (warning_inc > 0) {
-    gpuAtomicAdd(&warning[0], warning_inc);
+  // Accumulate per-thread warning counts in shared memory and reduce once per block.
+  block_warning_buffer[linear_tid] = warning_inc;
+  __syncthreads();
+
+  // Parallel tree reduction
+  for (int stride = active_threads / 2; stride > 0; stride >>= 1) {
+    if (linear_tid < stride) {
+      block_warning_buffer[linear_tid] += block_warning_buffer[linear_tid + stride];
+    }
+    __syncthreads();
+  }
+
+  // Thread 0 has the final sum
+  if (linear_tid == 0) {
+    int64_t block_warning_sum = block_warning_buffer[0];
+    if (block_warning_sum > 0) {
+      gpuAtomicAdd(&warning[0], block_warning_sum);
+    }
   }
+  __syncthreads();
   if (bounds_check_mode == BoundsCheckMode::WARNING && invalid_i != -1 &&
       static_cast<int64_t>(atomicAdd(
           reinterpret_cast<unsigned long long int*>(&warning[0]), 0)) == 0) {
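For reference, here is a minimal, self-contained sketch of the block-wide reduction pattern the added code follows: each thread stages a private count in shared memory, a tree reduction halves the number of active threads at every step, and only thread 0 issues the global atomic. The kernel name `count_negative_entries`, the `kMaxThreads = 256` value, and the "count negative entries" payload are illustrative placeholders, not part of `bounds_check_indices_kernel_v2`; the sketch also assumes a one-dimensional launch whose block size is a power of two, which this style of tree reduction requires.

```cuda
// Illustrative sketch only: a block-level shared-memory reduction that
// replaces one global atomic per thread with one global atomic per block.
#include <cstdint>
#include <cstdio>

constexpr int kMaxThreads = 256;  // assumed maximum (and actual) block size

__global__ void count_negative_entries(
    const int64_t* __restrict__ data,
    int64_t n,
    int64_t* __restrict__ global_count) {
  __shared__ int64_t block_buffer[kMaxThreads];

  // Each thread accumulates a private count over a grid-stride range.
  int64_t local_count = 0;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    if (data[i] < 0) {
      ++local_count;
    }
  }

  // Stage the private counts in shared memory.
  block_buffer[threadIdx.x] = local_count;
  __syncthreads();

  // Tree reduction: halve the number of active threads each step.
  // Correct only when blockDim.x is a power of two.
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride) {
      block_buffer[threadIdx.x] += block_buffer[threadIdx.x + stride];
    }
    __syncthreads();
  }

  // One atomic per block instead of one per thread.
  if (threadIdx.x == 0 && block_buffer[0] > 0) {
    atomicAdd(reinterpret_cast<unsigned long long*>(global_count),
              static_cast<unsigned long long>(block_buffer[0]));
  }
}

int main() {
  constexpr int64_t kN = 1 << 20;
  int64_t* data;
  int64_t* count;
  cudaMallocManaged(&data, kN * sizeof(int64_t));
  cudaMallocManaged(&count, sizeof(int64_t));
  for (int64_t i = 0; i < kN; ++i) {
    data[i] = (i % 7 == 0) ? -1 : i;  // every 7th entry is "invalid"
  }
  *count = 0;

  count_negative_entries<<<64, kMaxThreads>>>(data, kN, count);
  cudaDeviceSynchronize();
  printf("negative entries: %lld (expected %lld)\n",
         static_cast<long long>(*count),
         static_cast<long long>((kN + 6) / 7));

  cudaFree(data);
  cudaFree(count);
  return 0;
}
```

The trade-off is contention: instead of one `gpuAtomicAdd` on `warning[0]` per thread, the block pays for a shared-memory reduction and a couple of extra `__syncthreads()` calls and then issues a single atomic per block.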