
Commit 5e67be1

okakarpa authored and jataylo committed
[AUTOGENERATED] [release/2.8] [release/2.7] [SWDEV-543214] Reland #2416 Fix warps runtime part 2 (#2455)
Cherry-pick of #2442

Co-authored-by: Jack Taylor <[email protected]>
(cherry picked from commit 77a6760)
1 parent 4142eef commit 5e67be1


torch/_inductor/runtime/coordinate_descent_tuner.py

Lines changed: 8 additions & 3 deletions
@@ -63,9 +63,14 @@ def get_config_max(self, prefix: str) -> int:
 
     @lru_cache(maxsize=1)
     def get_warpsmax(self):
-        # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
-        # number of warps.
-        return 1024 // 32
+        # CUDA/ROCm has a maximum of 1024 threads per block
+        from torch.cuda import current_device, get_device_properties, is_available
+
+        warp_size = (
+            get_device_properties(current_device()).warp_size if is_available() else 32
+        )
+
+        return 1024 // warp_size
 
     def cache_benchmark_result(self, config, timing):
         self.cached_benchmark_results[triton_config_to_hashable(config)] = timing
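
For context: on ROCm the warp (wavefront) size is 64 rather than CUDA's 32, so the old hard-coded 1024 // 32 overstated the maximum warp count on AMD hardware; the relanded patch queries the actual device property at runtime instead. A minimal standalone sketch of the same logic (the max_warps wrapper name is illustrative, not part of the patch):

import torch

def max_warps() -> int:
    # Both CUDA and ROCm cap a thread block at 1024 threads.
    if torch.cuda.is_available():
        # warp_size is 32 on NVIDIA GPUs and 64 on most AMD GPUs.
        device = torch.cuda.current_device()
        warp_size = torch.cuda.get_device_properties(device).warp_size
    else:
        # No GPU present: fall back to the old hard-coded assumption.
        warp_size = 32
    return 1024 // warp_size

print(max_warps())  # 32 on NVIDIA, 16 on wavefront-64 AMD GPUs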

0 commit comments