
Commit 1cc51c6

pytorchbot and eqy authored
[CUDA][avgpool2d] Fix backward launch bounds again for sm100, sm120 (pytorch#150676)

[CUDA][avgpool2d] Fix backward launch bounds again for `sm100`, `sm120` (pytorch#150640)

`__CUDA_ARCH__` is not visible in host code, which causes incorrect launch bounds and `too many resources requested for launch` on Blackwell.

Pull Request resolved: pytorch#150640
Approved by: https://github.com/malfet, https://github.com/drisspg, https://github.com/atalman

(cherry picked from commit 09c4da9)

Co-authored-by: Eddie Yan <[email protected]>
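For context, here is a minimal sketch of the failure mode the commit message describes (the kernel and helper names are hypothetical, not the PyTorch source). `__CUDA_ARCH__` is only defined during nvcc's device compilation passes, so a host-side `#if` on it always takes the `#else` branch; the host then picks a 1024-thread block even on `sm100`/`sm120`, exceeding the kernel's `__launch_bounds__` and failing with `too many resources requested for launch`. Querying device properties at runtime, as the fix does, keeps the host-side choice in sync:

#include <cuda_runtime.h>

// Device compilation passes define __CUDA_ARCH__, so a per-arch
// launch bound works correctly here.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
#define MAX_BLOCK_THREADS 768   // Blackwell (sm100/sm120)
#else
#define MAX_BLOCK_THREADS 1024
#endif

__global__ void __launch_bounds__(MAX_BLOCK_THREADS)
example_kernel(float* out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
}

// Host pass: __CUDA_ARCH__ is never defined here, so an #if on it
// silently falls through to 1024. Query the device at runtime instead.
int pick_block_threads() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);
  return (prop.major >= 10) ? 768 : 1024;  // must match the kernel's bounds
}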
1 parent 28ca4dd

File tree

1 file changed: +6 -5 lines changed

aten/src/ATen/native/cuda/AveragePool2d.cu

@@ -402,11 +402,12 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) (
   bool use_divisor = divisor_override.has_value();
   const auto divisor_override_value = use_divisor ? divisor_override.value() : 0;

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
-  constexpr int double_threads = 768;
-#else
-  constexpr int double_threads = 1024;
-#endif
+  cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
+  const bool gesm10x = properties->major >= 10;
+  int double_threads = 1024;
+  if (gesm10x) {
+    double_threads = 768;
+  }

   AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
     "avg_pool2d_backward_out_cuda_frame",
