
Commit 7de1214

naromero77amd, jataylo, and AmdSampsa
authored
[NO CP][release/2.7][ROCm][inductor] Inductor heuristic upstream backports (#2807)
These are backports based on the following upstream PRs. Cherry-picks were performed where possible.

- pytorch#163908 (persistent reduction autotune)
- pytorch#161280 (reduction)
- pytorch#162053 (foreach)
- pytorch#163197 (pointwise)
- pytorch#166470 (pointwise config for atomic add)

Also included are some additional customer-specific configs which were not upstreamed but are in the release/2.9 backport (#2723).

Did not backport filter functions such as `_maybe_filter_configs_for_tma_restrictions`:
https://github.com/ROCm/pytorch/blob/release/2.9/torch/_inductor/runtime/triton_heuristics.py#L2614

---------

Co-authored-by: Jack Taylor <[email protected]>
Co-authored-by: Jack Taylor <[email protected]>
Co-authored-by: Sampsa Riikonen <[email protected]>
Co-authored-by: AmdSampsa <[email protected]>
1 parent 175d5a6 commit 7de1214

File tree

5 files changed: +448 -59 lines changed

torch/_inductor/codegen/common.py

Lines changed: 1 addition & 0 deletions
@@ -1873,6 +1873,7 @@ def __init__(
         self.compute = IndentedBuffer()
         self.stores = IndentedBuffer()

+        self.atomic_add_found = False
         self.num_load = 0
         self.num_reduction = 0

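The flag is only initialized here; the torch/_inductor/codegen/triton.py hunks below set it when an atomic_add store is emitted and export it through inductor_meta. A heuristic can then special-case kernels that contain atomic adds, roughly as in this minimal sketch (the function name, signature, and XBLOCK cap are illustrative assumptions, not the backported code):

# Hypothetical sketch: how an autotuning heuristic could consume the new
# "atomic_add_found" flag once it reaches inductor_meta (see the triton.py
# hunks below). The function name and the XBLOCK cap are assumptions.
def _filter_configs_for_atomic_add(configs, inductor_meta):
    if not inductor_meta.get("atomic_add_found", False):
        return configs
    # With atomic adds in the kernel, prefer smaller X blocks to limit
    # contention; the real backported policy may differ.
    smaller = [c for c in configs if c.kwargs.get("XBLOCK", 1) <= 1024]
    return smaller or configs
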
torch/_inductor/codegen/triton.py

Lines changed: 16 additions & 4 deletions
@@ -1002,11 +1002,17 @@ def relu(x):

     @staticmethod
     def minimum(a, b):
-        return f"triton_helpers.minimum({a}, {b})"
+        if torch.version.hip:
+            return f"tl.minimum({a}, {b}, tl.PropagateNan.ALL)"
+        else:
+            return f"triton_helpers.minimum({a}, {b})"

     @staticmethod
     def maximum(a, b):
-        return f"triton_helpers.maximum({a}, {b})"
+        if torch.version.hip:
+            return f"tl.maximum({a}, {b}, tl.PropagateNan.ALL)"
+        else:
+            return f"triton_helpers.maximum({a}, {b})"

     @staticmethod
     def where(a, b, c):

@@ -1202,7 +1208,10 @@ def load_seed(name, offset):
     @staticmethod
     @maybe_upcast_float32()
     def rsqrt(x):
-        return f"libdevice.rsqrt({x})"
+        if torch.version.hip:
+            return f"tl.rsqrt({x})"
+        else:
+            return f"libdevice.rsqrt({x})"

     @staticmethod
     @maybe_upcast_float32()

@@ -2284,6 +2293,7 @@ def store(
         elif mode is None:
             line = f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})"
         elif mode == "atomic_add":
+            self.atomic_add_found = True
             line = f"tl.atomic_add({var} + ({indexing.index_str}), {value}, {indexing.mask_str}, sem='relaxed')"
         else:
             raise NotImplementedError(f"store mode={mode}")

@@ -3226,8 +3236,9 @@ def codegen_body(self):
             loop_end = (
                 "rsplit_end" if self.cooperative_reduction else f"{prefix}numel"
             )
+            num_stages = ", num_stages = 2" if torch.version.hip else ""
             self.body.writeline(
-                f"for {prefix}offset in range({loop_start}, {loop_end}, {prefix.upper()}BLOCK):"
+                f"for {prefix}offset in tl.range({loop_start}, {loop_end}, {prefix.upper()}BLOCK{num_stages}):"
             )
             with self.body.indent(offset=level + 1):
                 self.iteration_ranges_codegen_header(tree, self.body)

@@ -3600,6 +3611,7 @@ def add_constexpr_arg(arg_name):
             "mutated_arg_names": mutated_args,
             "optimize_mem": optimize_mem,
             "no_x_dim": self.no_x_dim,
+            "atomic_add_found": self.atomic_add_found,
             "num_load": self.num_load,
             "num_reduction": self.num_reduction,
             **self.inductor_meta_common(),
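
Taken together, the ROCm branches above change what the generated Triton source looks like: NaN-propagating tl.minimum/tl.maximum, tl.rsqrt in place of libdevice.rsqrt, and software-pipelined reduction loops via tl.range(..., num_stages=2). A hand-written approximation follows (not actual Inductor output; the kernel name, arguments, and the clamp/rsqrt math are illustrative assumptions):

# Hand-written approximation of the ROCm codegen choices above; not actual
# Inductor output. Kernel name, arguments, and the math are assumptions.
import triton
import triton.language as tl


@triton.jit
def clamped_rsqrt_sum_kernel(in_ptr, out_ptr, rnumel, RBLOCK: tl.constexpr):
    acc = tl.zeros([RBLOCK], dtype=tl.float32)
    # Pipelined reduction loop, as now emitted on ROCm via tl.range(..., num_stages=2)
    for roffset in tl.range(0, rnumel, RBLOCK, num_stages=2):
        rindex = roffset + tl.arange(0, RBLOCK)
        rmask = rindex < rnumel
        x = tl.load(in_ptr + rindex, mask=rmask, other=1.0)
        # NaN-propagating clamp, matching the tl.minimum/tl.maximum overrides
        x = tl.minimum(tl.maximum(x, 0.0, tl.PropagateNan.ALL), 1.0, tl.PropagateNan.ALL)
        # tl.rsqrt replaces libdevice.rsqrt on ROCm
        acc += tl.where(rmask, tl.rsqrt(x + 1.0), 0.0)
    tl.store(out_ptr, tl.sum(acc, 0))
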

torch/_inductor/config.py

Lines changed: 1 addition & 1 deletion
@@ -1135,7 +1135,7 @@ class triton:
     # So far we see a fixed 8 spilled registers for kernels using sin/cos.
     # Raise the threshold to 16 to be safe.
     # We should revisit this once we understand more of the source of register spills.
-    spill_threshold: int = 16
+    spill_threshold: int = 32 if torch.version.hip else 16

     # Generate code containing the newer tl.make_block_ptr() API for loads/store
     use_block_ptr = False
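
The higher ROCm default can still be inspected or overridden per run; a minimal sketch, assuming a PyTorch build that includes this patch:

# Minimal sketch: inspect and override the spill threshold at runtime.
# Assumes a PyTorch build containing this patch.
import torch
from torch._inductor import config as inductor_config

expected_default = 32 if torch.version.hip else 16
assert inductor_config.triton.spill_threshold == expected_default

# Loosen the threshold further for a specific experiment if desired.
inductor_config.triton.spill_threshold = 64
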

torch/_inductor/runtime/hints.py

Lines changed: 2 additions & 1 deletion
@@ -7,13 +7,14 @@
 from enum import auto, Enum
 from typing import Optional, Union

+import torch
 from torch.utils._triton import has_triton_package


 # The following maximums only apply to runtime autotuning, when using FixedTritonConfig one may see larger values
 # NOTE: if these fail asserts submit a PR to increase them
 TRITON_MAX_BLOCK = {
-    "X": 4096,
+    "X": 8192 if torch.version.hip else 4096,
     "Y": 1024,
     "Z": 1024,
     "R0_": 4096 * 16,  # * 16 is multi-kernel only
