
Commit bf92b1a

jataylo, naromero77amd, and AmdSampsa authored and committed
[Release 2.9] Inductor perf cherry picks (#2723)
These changes are currently in the process of being upstreamed. Bring them into release 2.9 for customer model perf improvements.

---------

Co-authored-by: Nichols A. Romero <[email protected]>
Co-authored-by: Sampsa Riikonen <[email protected]>
Co-authored-by: Nichols A. Romero <[email protected]>
Co-authored-by: AmdSampsa <[email protected]>
1 parent 839c2fd commit bf92b1a

File tree

5 files changed
+201 -63 lines changed


test/inductor/test_async_compile.py

Lines changed: 9 additions & 2 deletions
@@ -74,7 +74,14 @@ def f(a, b):
             return (a @ b).to(torch.float32).sum(dim=1)
 
         # Fake name to make sure the lookup table is name agnostic
-        func_def = """
+        # When codegen/triton.py is changed, func_def must be updated
+        loop_header = (
+            "for r0_offset in tl.range(0, r0_numel, R0_BLOCK, num_stages = 2):"
+            if torch.version.hip
+            else "for r0_offset in range(0, r0_numel, R0_BLOCK):"
+        )
+
+        func_def = f"""
 def triton_fused_fake_name(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
     xnumel = 1024
     r0_numel = 11776
@@ -87,7 +94,7 @@ def triton_fused_fake_name(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.cons
     rbase = r0_base
     x0 = xindex
     _tmp3 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
-    for r0_offset in range(0, r0_numel, R0_BLOCK):
+    {loop_header}
     r0_index = r0_offset + r0_base
     r0_mask = r0_index < r0_numel
     roffset = r0_offset
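For context, a minimal sketch (not part of the diff) of the torch.version.hip gate this test update relies on: torch.version.hip is None on CUDA/CPU builds and a version string on ROCm builds, so the test only expects the ROCm-specific loop header when running on ROCm. The variable names below are illustrative.

import torch

# torch.version.hip is None on non-ROCm builds, so this boolean selects the
# loop header that codegen/triton.py is expected to emit on each platform.
on_rocm = bool(torch.version.hip)
expected_loop_header = (
    "for r0_offset in tl.range(0, r0_numel, R0_BLOCK, num_stages = 2):"
    if on_rocm
    else "for r0_offset in range(0, r0_numel, R0_BLOCK):"
)
print(f"ROCm build: {on_rocm}; expected loop header: {expected_loop_header}")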

torch/_inductor/codegen/triton.py

Lines changed: 14 additions & 4 deletions
@@ -1101,11 +1101,17 @@ def relu(x):
 
     @staticmethod
     def minimum(a, b):
-        return f"triton_helpers.minimum({a}, {b})"
+        if torch.version.hip:
+            return f"tl.minimum({a}, {b}, tl.PropagateNan.ALL)"
+        else:
+            return f"triton_helpers.minimum({a}, {b})"
 
     @staticmethod
     def maximum(a, b):
-        return f"triton_helpers.maximum({a}, {b})"
+        if torch.version.hip:
+            return f"tl.maximum({a}, {b}, tl.PropagateNan.ALL)"
+        else:
+            return f"triton_helpers.maximum({a}, {b})"
 
     @staticmethod
     def where(a, b, c):
@@ -1291,7 +1297,10 @@ def load_seed(name, offset):
     @staticmethod
     @maybe_upcast_float32()
     def rsqrt(x):
-        return f"libdevice.rsqrt({x})"
+        if torch.version.hip:
+            return f"tl.rsqrt({x})"
+        else:
+            return f"libdevice.rsqrt({x})"
 
     @staticmethod
     @maybe_upcast_float32()
@@ -3788,8 +3797,9 @@ def codegen_body(self):
                loop_end = (
                    "rsplit_end" if self.cooperative_reduction else f"{prefix}numel"
                )
+               num_stages = ", num_stages = 2" if torch.version.hip else ""
                self.body.writeline(
-                   f"for {prefix}offset in range({loop_start}, {loop_end}, {prefix.upper()}BLOCK):"
+                   f"for {prefix}offset in tl.range({loop_start}, {loop_end}, {prefix.upper()}BLOCK{num_stages}):"
                )
                with self.body.indent(offset=level + 1):
                    self.iteration_ranges_codegen_header(tree, self.body)
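For context: on ROCm these codegen changes route minimum/maximum to tl.minimum/tl.maximum with tl.PropagateNan.ALL (keeping NaN propagation without the triton_helpers wrappers), rsqrt to tl.rsqrt instead of libdevice.rsqrt, and emit the reduction loop as tl.range(..., num_stages=2) so Triton can software-pipeline it. Below is a rough hand-written sketch of a reduction loop in that emitted style, assuming a Triton version that supports tl.range(..., num_stages=...); the kernel and pointer names are made up for illustration and are not generated by Inductor.

import triton
import triton.language as tl


@triton.jit
def rowsum_kernel(in_ptr, out_ptr, xnumel, r0_numel, XBLOCK: tl.constexpr, R0_BLOCK: tl.constexpr):
    # One program handles XBLOCK rows; the reduction dimension is tiled by R0_BLOCK.
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    r0_base = tl.arange(0, R0_BLOCK)[None, :]
    acc = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
    # tl.range with num_stages=2 asks the compiler to software-pipeline the loop,
    # overlapping global loads for the next tile with compute on the current one.
    for r0_offset in tl.range(0, r0_numel, R0_BLOCK, num_stages=2):
        r0_index = r0_offset + r0_base
        r0_mask = r0_index < r0_numel
        vals = tl.load(in_ptr + r0_numel * xindex + r0_index, xmask & r0_mask, other=0.0)
        acc += vals
    tl.store(out_ptr + xindex, tl.sum(acc, axis=1)[:, None], xmask)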

torch/_inductor/config.py

Lines changed: 1 addition & 1 deletion
@@ -1395,7 +1395,7 @@ class triton:
     # So far we see a fixed 8 spilled registers for kernels using sin/cos.
     # Raise the threshold to 16 to be safe.
     # We should revisit this once we understand more of the source of register spills.
-    spill_threshold: int = 16
+    spill_threshold: int = 32 if torch.version.hip else 16
 
     # Generate code containing the newer tl.make_block_ptr() API for loads/store
     use_block_ptr = False
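For context, spill_threshold is read through torch._inductor.config during Triton autotuning; roughly speaking, candidate configs whose compiled kernels spill more registers than the threshold are deprioritized, so the higher ROCm default keeps more candidates in play. A small sketch of inspecting and overriding it; the override value 48 is arbitrary and only illustrative.

import torch
import torch._inductor.config as inductor_config

# After this change the default is 32 on ROCm builds and 16 elsewhere.
print(inductor_config.triton.spill_threshold)

# Inductor config values can be overridden at runtime before compilation;
# 48 here is just an example value, not a recommendation.
inductor_config.triton.spill_threshold = 48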

torch/_inductor/runtime/hints.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # The following maximums only apply to runtime autotuning, when using FixedTritonConfig one may see larger values
 # NOTE: if these fail asserts submit a PR to increase them
 TRITON_MAX_BLOCK = {
-    "X": 4096,
+    "X": 8192,
     "Y": 1024,
     "Z": 1024,
     "R0_": 4096 * 16,  # * 16 is multi-kernel only
