Commit 5f9e611

Triton perf improvements, added poi tuning config
1 parent bd74018 · commit 5f9e611

3 files changed, +53 −9 lines


torch/_inductor/codegen/triton.py

Lines changed: 3 additions & 3 deletions
@@ -1027,11 +1027,11 @@ def relu(x):
 
     @staticmethod
     def minimum(a, b):
-        return f"triton_helpers.minimum({a}, {b})"
+        return f"tl.minimum({a}, {b})"
 
     @staticmethod
     def maximum(a, b):
-        return f"triton_helpers.maximum({a}, {b})"
+        return f"tl.maximum({a}, {b})"
 
     @staticmethod
     def where(a, b, c):

@@ -1217,7 +1217,7 @@ def load_seed(name, offset):
     @staticmethod
     @maybe_upcast_float32()
     def rsqrt(x):
-        return f"libdevice.rsqrt({x})"
+        return f"tl.rsqrt({x})"
 
     @staticmethod
     @maybe_upcast_float32()
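
For reference, these overrides are plain string emitters: each one returns the Triton expression that ends up in the generated kernel source, so the change simply routes minimum/maximum/rsqrt to Triton's built-in tl.* intrinsics instead of the triton_helpers / libdevice wrappers. A minimal standalone sketch of the emitters after this commit (plain functions here, not the real override class):

# Standalone sketch of the three emitters after this change; the real methods
# are @staticmethods on the inductor Triton overrides class.
def minimum(a: str, b: str) -> str:
    return f"tl.minimum({a}, {b})"

def maximum(a: str, b: str) -> str:
    return f"tl.maximum({a}, {b})"

def rsqrt(x: str) -> str:
    return f"tl.rsqrt({x})"

# The generated kernel source now contains e.g. "tl.minimum(tmp0, tmp1)".
print(minimum("tmp0", "tmp1"))
print(rsqrt("tmp2"))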

torch/_inductor/runtime/hints.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # The following maximums only apply to runtime autotuning, when using FixedTritonConfig one may see larger values
 # NOTE: if these fail asserts submit a PR to increase them
 TRITON_MAX_BLOCK = {
-    "X": 4096,
+    "X": 8192,
     "Y": 1024,
     "Z": 1024,
     "R0_": 4096 * 16,  # * 16 is multi-kernel only

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 49 additions & 5 deletions
@@ -1987,6 +1987,9 @@ def triton_config(
     num_stages=1,
     num_elements_per_warp=256,
     min_elem_per_thread=0,
+    num_warps=None,
+    matrix_instr=None,
+    waves_per_eu=None
 ) -> Config:
     """
     Construct a pointwise triton config with some adjustment heuristics

@@ -2043,9 +2046,11 @@ def triton_config(
     ):
         z *= 2
 
-    num_warps = _num_warps(
-        conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1
-    )
+    # Calculate num_warps if they are not hard passed to config
+    if num_warps is None:
+        num_warps = _num_warps(
+            conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1
+        )
     # we are going to arrive at 2 warps only if bs was too small due to
     # numel being too small. However to workaround some ptx bugs we still
     # want at least 4 warps if there's enough elements per thread

@@ -2075,7 +2080,15 @@ def triton_config(
         cfg["ZBLOCK"] = z
     check_max_block(cfg)
     check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel)
-    return Config(cfg, num_warps=num_warps, num_stages=num_stages)
+    config = Config(cfg, num_warps=num_warps, num_stages=num_stages)
+
+    if torch.version.hip:
+        if matrix_instr is not None:
+            config.kwargs["matrix_instr_nonkdim"] = matrix_instr
+        if waves_per_eu is not None:
+            config.kwargs["waves_per_eu"] = waves_per_eu
+
+    return config
 
 
 def _get_nd_reduction_numels(r: int, size_hints: dict[str, int]) -> dict[str, int]:

@@ -2123,6 +2136,7 @@ def triton_config_reduction(
     num_stages=1,
     num_warps=None,
     register_intensive=False,
+    waves_per_eu=None
 ) -> Config:
     """
     Construct a reduction triton config with some adjustment heuristics

@@ -2166,7 +2180,13 @@ def total_numel() -> int:
     cfg = _get_config({"x": x, **rnumels})
     check_max_block(cfg)
     check_config(cfg, xnumel=size_hints["x"])
-    return Config(cfg, num_warps=num_warps, num_stages=num_stages)
+    config = Config(cfg, num_warps=num_warps, num_stages=num_stages)
+
+    if torch.version.hip:
+        if waves_per_eu is not None:
+            config.kwargs["waves_per_eu"] = waves_per_eu
+
+    return config
 
 
 def _get_config(numels: dict[str, int]) -> dict[str, int]:

@@ -2259,6 +2279,12 @@ def pointwise(
            triton_config_with_settings(
                size_hints, bs // 2, num_elements_per_warp=64
            ),
+            # triton_config_with_settings(
+            #     size_hints, 8192, num_warps=8, num_stages=1, matrix_instr=0, waves_per_eu=2
+            # ),
+            triton_config_with_settings(
+                size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2
+            ),
            *hinted_configs,
        ]
    if len(size_hints) == 2:

@@ -2491,6 +2517,24 @@ def reduction(
     assert triton_meta is not None
 
     configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta)
+
+    # Additional tuning configs for ROCm builds
+    # Add checks for reduction autotuning bools
+    # if torch.version.hip and inductor_meta.get("max_autotune"):
+    #     configs = [
+    #         triton_config_with_settings(size_hints, bs, num_elements_per_warp=256),
+    #         triton_config_with_settings(
+    #             size_hints, bs // 2, num_elements_per_warp=64
+    #         ),
+    #         # triton_config_with_settings(
+    #         #     size_hints, 8192, num_warps=8, num_stages=1, matrix_instr=0, waves_per_eu=2
+    #         # ),
+    #         triton_config_with_settings(
+    #             size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2
+    #         ),
+    #         *hinted_configs,
+    #     ]
+
     return cached_autotune(
         size_hints,
         configs=configs,
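
Taken together, the heuristics changes let a caller pin num_warps and attach the ROCm-only compiler hints waves_per_eu and matrix_instr_nonkdim to a generated config. A self-contained sketch that mirrors, but does not reproduce, the edited triton_config() logic (the warp heuristic below is a stand-in assumption, and triton.Config is assumed to be importable):

# Sketch mirroring the new triton_config() behaviour, not the real helper.
import torch
from triton import Config  # inductor builds Triton's Config objects

def make_pointwise_config(xblock: int, num_warps=None, matrix_instr=None, waves_per_eu=None) -> Config:
    if num_warps is None:
        # Stand-in for the real _num_warps(conditional_product(...) // num_elements_per_warp)
        num_warps = max(1, min(8, xblock // 256))
    config = Config({"XBLOCK": xblock}, num_warps=num_warps, num_stages=1)
    # ROCm-only compiler hints are attached as extra kernel kwargs
    if torch.version.hip:
        if matrix_instr is not None:
            config.kwargs["matrix_instr_nonkdim"] = matrix_instr
        if waves_per_eu is not None:
            config.kwargs["waves_per_eu"] = waves_per_eu
    return config

# e.g. the new tuning candidate: XBLOCK = TRITON_MAX_BLOCK["X"] with waves_per_eu=2
cfg = make_pointwise_config(8192, waves_per_eu=2)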
