Commit c109dc7

[Gluon] Add an opt pass pipeline for gluon (#6992)
This PR adds a separate TTGIR optimization pipeline for code parsed directly from Gluon. The most important pieces are the inliner and a basic set of TTGIR-level optimizations; I added the passes that seemed obviously good to have.
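For orientation, here is a minimal sketch of how a backend could assemble the new Gluon TTGIR pipeline from the Python wrappers added in this PR. `make_gluon_ttgir` and the surrounding stage wiring are assumptions for illustration; only `add_inliner` and `add_canonicalizer` are introduced by this commit (see the python/src/passes.cc hunk below).

    # Hypothetical pipeline assembly; make_gluon_ttgir is an assumed name.
    from triton._C.libtriton import ir, passes

    def make_gluon_ttgir(mod, metadata, options):
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        # Inline tt.func calls; the inliner also runs tritongpu-canonicalize
        # on each inlined callee (see the passes.cc hunk below).
        passes.ttgpuir.add_inliner(pm)
        # Layout-preserving TTGIR simplifications.
        passes.ttgpuir.add_canonicalizer(pm)
        pm.run(mod)
        return mod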
Parent: fed9ac4

File tree: 15 files changed, +244 −79 lines
include/triton/Dialect/Triton/IR/Interfaces.h

Lines changed: 36 additions & 0 deletions

@@ -1,9 +1,45 @@
 #ifndef TRITON_IR_INTERFACES_H_
 #define TRITON_IR_INTERFACES_H_
 
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Transforms/InliningUtils.h"
 
 #define GET_TYPEDEF_CLASSES
 #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc"
 
+namespace mlir::triton {
+
+//===----------------------------------------------------------------------===//
+// TritonDialect Dialect Interfaces
+//===----------------------------------------------------------------------===//
+
+struct TritonInlinerInterface : public DialectInlinerInterface {
+  using DialectInlinerInterface::DialectInlinerInterface;
+
+  bool isLegalToInline(Operation *call, Operation *callable,
+                       bool wouldBeCloned) const final;
+  bool isLegalToInline(Region *dest, Region *src, bool wouldBeCloned,
+                       IRMapping &valueMapping) const final {
+    return true;
+  }
+  bool isLegalToInline(Operation *, Region *, bool wouldBeCloned,
+                       IRMapping &) const final {
+    return true;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Transformation Hooks
+  //===--------------------------------------------------------------------===//
+
+  /// Handle the given inlined terminator by replacing it with a new operation
+  /// as necessary.
+  void handleTerminator(Operation *op, Block *newDest) const final;
+  /// Handle the given inlined terminator by replacing it with a new operation
+  /// as necessary.
+  void handleTerminator(Operation *op, ValueRange valuesToRepl) const final;
+};
+
+} // namespace mlir::triton
+
 #endif // TRITON_IR_TYPES_H_
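Hoisting `TritonInlinerInterface` out of the anonymous namespace in lib/Dialect/Triton/IR/Dialect.cpp and into this public header is what lets the TritonGPU dialect reuse it: the same interface is registered on TTGIR below (`addInterfaces<TritonInlinerInterface>()` in lib/Dialect/TritonGPU/IR/Dialect.cpp), which is what makes inlining possible at the TTGIR level for Gluon.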

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 13 additions & 0 deletions
@@ -360,4 +360,17 @@ def TritonGPUCoalesceAsyncCopy: Pass<"tritongpu-coalesce-async-copy", "mlir::Mod
                              "mlir::triton::TritonDialect"];
 }
 
+def TritonGPUCanonicalize: Pass<"tritongpu-canonicalize"> {
+  let summary = "reduced set of simplifications for TTGIR";
+
+  let description = [{
+    The `tritongpu-canonicalize` pass applies a reduced set of simplification
+    and canonicalization patterns to the module.
+  }];
+  let dependentDialects = [
+    "mlir::arith::ArithDialect",
+    "mlir::scf::SCFDialect",
+  ];
+}
+
 #endif
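Exposing a reduced pattern set, rather than reusing MLIR's generic -canonicalize, means simplification cannot rewrite layouts a Gluon author chose explicitly. Assuming the standard MLIR pass registration this tablegen generates, the pass can also be invoked by its flag, e.g. `triton-opt -tritongpu-canonicalize input.mlir`.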

lib/Dialect/Triton/IR/Dialect.cpp

Lines changed: 37 additions & 58 deletions
@@ -1,16 +1,12 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Interfaces.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/raw_ostream.h"
 
-#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/IR/DialectImplementation.h"
-
-#include "mlir/Transforms/InliningUtils.h"
 #include "triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc"
 #include "triton/Dialect/Triton/IR/Dialect.cpp.inc"
 #include "triton/Dialect/Triton/IR/OpInterfaces.cpp.inc"
@@ -22,62 +18,45 @@ using namespace mlir::triton;
 // TritonDialect Dialect Interfaces
 //===----------------------------------------------------------------------===//
 
-namespace {
-struct TritonInlinerInterface : public DialectInlinerInterface {
-  using DialectInlinerInterface::DialectInlinerInterface;
-
-  bool isLegalToInline(Operation *call, Operation *callable,
-                       bool wouldBeCloned) const final {
-    auto funcOp = dyn_cast<triton::FuncOp>(callable);
-    if (!funcOp)
-      return true;
-    if (funcOp->hasAttr("noinline"))
-      return !funcOp->getAttrOfType<BoolAttr>("noinline").getValue();
-    return true;
-  }
-
-  bool isLegalToInline(Region *dest, Region *src, bool wouldBeCloned,
-                       IRMapping &valueMapping) const final {
-    return true;
-  }
-
-  bool isLegalToInline(Operation *, Region *, bool wouldBeCloned,
-                       IRMapping &) const final {
+bool TritonInlinerInterface::isLegalToInline(Operation *call,
+                                             Operation *callable,
+                                             bool wouldBeCloned) const {
+  auto funcOp = dyn_cast<triton::FuncOp>(callable);
+  if (!funcOp)
     return true;
-  }
-  //===--------------------------------------------------------------------===//
-  // Transformation Hooks
-  //===--------------------------------------------------------------------===//
-
-  /// Handle the given inlined terminator by replacing it with a new operation
-  /// as necessary.
-  void handleTerminator(Operation *op, Block *newDest) const final {
-    // Only return needs to be handled here.
-    auto returnOp = dyn_cast<triton::ReturnOp>(op);
-    if (!returnOp)
-      return;
-
-    // Replace the return with a branch to the dest.
-    OpBuilder builder(op);
-    builder.create<mlir::cf::BranchOp>(op->getLoc(), newDest,
-                                       returnOp.getOperands());
-    op->erase();
-  }
-
-  /// Handle the given inlined terminator by replacing it with a new operation
-  /// as necessary.
-  void handleTerminator(Operation *op, ValueRange valuesToRepl) const final {
-    // Only return needs to be handled here.
-    auto returnOp = cast<triton::ReturnOp>(op);
+  if (funcOp->hasAttr("noinline"))
+    return !funcOp->getAttrOfType<BoolAttr>("noinline").getValue();
+  return true;
+}
 
-    // Replace the values directly with the return operands.
-    assert(returnOp.getNumOperands() == valuesToRepl.size());
-    for (const auto &it : llvm::enumerate(returnOp.getOperands()))
-      valuesToRepl[it.index()].replaceAllUsesWith(it.value());
-  }
-};
+/// Handle the given inlined terminator by replacing it with a new operation
+/// as necessary.
+void TritonInlinerInterface::handleTerminator(Operation *op,
+                                              Block *newDest) const {
+  // Only return needs to be handled here.
+  auto returnOp = dyn_cast<triton::ReturnOp>(op);
+  if (!returnOp)
+    return;
+
+  // Replace the return with a branch to the dest.
+  OpBuilder builder(op);
+  builder.create<mlir::cf::BranchOp>(op->getLoc(), newDest,
+                                     returnOp.getOperands());
+  op->erase();
+}
 
-} // namespace
+/// Handle the given inlined terminator by replacing it with a new operation
+/// as necessary.
+void TritonInlinerInterface::handleTerminator(Operation *op,
+                                              ValueRange valuesToRepl) const {
+  // Only return needs to be handled here.
+  auto returnOp = cast<triton::ReturnOp>(op);
+
+  // Replace the values directly with the return operands.
+  assert(returnOp.getNumOperands() == valuesToRepl.size());
+  for (const auto &it : llvm::enumerate(returnOp.getOperands()))
+    valuesToRepl[it.index()].replaceAllUsesWith(it.value());
+}
 
 
 void TritonDialect::initialize() {
   registerTypes();
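The `noinline` handling above mirrors the frontend: marking a device function with `@triton.jit(noinline=True)` sets the `noinline` attribute on the generated `tt.func` (visible as `noinline = false` in the test output further below), and `isLegalToInline` respects it. A small illustration:

    # Illustration: this helper keeps its call sites through the inliner
    # because jit(noinline=True) sets "noinline" on the tt.func.
    import triton

    @triton.jit(noinline=True)
    def add_one(x):
        return x + 1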

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Interfaces.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -3083,6 +3084,7 @@ void TritonGPUDialect::initialize() {
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
 #include "triton/Dialect/TritonGPU/IR/OpsEnums.cpp.inc"
       >();
+  addInterfaces<TritonInlinerInterface>();
   addInterfaces<TritonGPUOpAsmInterface>();
   addInterfaces<TritonGPUInferLayoutInterface>();
   addInterfaces<TritonGPUVerifyTensorLayoutInterface>();

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 add_triton_library(TritonGPUTransforms
   AccelerateMatmul.cpp
+  Canonicalize.cpp
   Coalesce.cpp
   F32DotTC.cpp
   FuseNestedLoops.cpp
lib/Dialect/TritonGPU/Transforms/Canonicalize.cpp

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+
+using namespace mlir;
+using namespace triton;
+namespace ttg = triton::gpu;
+namespace ttng = triton::nvidia_gpu;
+
+namespace mlir::triton::gpu {
+#define GEN_PASS_DEF_TRITONGPUCANONICALIZE
+#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
+} // namespace mlir::triton::gpu
+
+namespace {
+struct Canonicalize
+    : public ttg::impl::TritonGPUCanonicalizeBase<Canonicalize> {
+  void runOnOperation() override;
+};
+} // namespace
+
+void Canonicalize::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  RewritePatternSet patterns(&getContext());
+
+  // Populate `arith` and `scf` canonicalizers.
+  ctx->getLoadedDialect<arith::ArithDialect>()->getCanonicalizationPatterns(
+      patterns);
+  ctx->getLoadedDialect<scf::SCFDialect>()->getCanonicalizationPatterns(
+      patterns);
+  populateForOpDeadArgumentElimination(patterns);
+
+  // Populate select Triton canonicalization patterns. The important patterns to
+  // EXCLUDE are those that modify layouts, especially `ConvertLayoutOp`
+  // patterns.
+  LoadOp::getCanonicalizationPatterns(patterns, ctx);
+  StoreOp::getCanonicalizationPatterns(patterns, ctx);
+  BroadcastOp::getCanonicalizationPatterns(patterns, ctx);
+  ExpandDimsOp::getCanonicalizationPatterns(patterns, ctx);
+  ttg::WarpSpecializeOp::getCanonicalizationPatterns(patterns, ctx);
+  ttng::TensorDescToTMAPtrOp::getCanonicalizationPatterns(patterns, ctx);
+}
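Note: as rendered in this extract the hunk ends right after populating the pattern set. Given the GreedyPatternRewriteDriver.h include, the pass presumably finishes by applying the collected patterns with the greedy driver (e.g. `applyPatternsGreedily(getOperation(), std::move(patterns))`, signaling pass failure on error); that call is not visible here.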

python/src/passes.cc

Lines changed: 7 additions & 0 deletions
@@ -51,6 +51,7 @@ void init_triton_passes_ttir(py::module &&m) {
 }
 
 void init_triton_passes_ttgpuir(py::module &&m) {
+  using namespace mlir;
   using namespace mlir::triton::gpu;
   ADD_PASS_WRAPPER_0("add_coalesce", createTritonGPUCoalesce);
   ADD_PASS_WRAPPER_0("add_optimize_thread_locality",
@@ -85,6 +86,12 @@ void init_triton_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_fuse_nested_loops", createTritonGPUFuseNestedLoops);
   ADD_PASS_WRAPPER_0("add_coalesce_async_copy",
                      createTritonGPUCoalesceAsyncCopy);
+  ADD_PASS_WRAPPER_0("add_canonicalizer", createTritonGPUCanonicalize);
+  ADD_PASS_WRAPPER_0("add_inliner", [] {
+    return createInlinerPass(/*opPipelines=*/{}, [](OpPassManager &pm) {
+      pm.addPass(createTritonGPUCanonicalize());
+    });
+  });
 }
 
 void init_triton_passes_convert(py::module &&m) {
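Design note: rather than binding MLIR's stock inliner, whose default pipeline would run the full canonicalizer on each callee, `add_inliner` constructs it with `tritongpu-canonicalize` as the per-callee pipeline, so inlining still simplifies code without disturbing TTGIR layout attributes.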

python/test/backend/test_device_backend.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ def __init__(self, device_type: str) -> None:
         self.driver = ExtensionDriver()
         self.version_key = None
 
-    def add_stages(self, arch, extern_libs, stages):
+    def add_stages(self, stages, options, language):
         filter_in_stages = ["ast", "ttir", "ttgir"]
         filter_out_stages = []
         for key, _ in stages.items():

python/test/gluon/test_frontend.py

Lines changed: 14 additions & 5 deletions
@@ -26,7 +26,7 @@ def test_convert_layout(fresh_knobs):
         1, ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 32], warps_per_cta=[1, 4], order=[1, 0]))
     h = convert_layout_kernel.warmup(128, layout_a, layout_b, num_warps=layout_a.warps_per_cta[0], grid=(1, ))
     expecttest.assert_expected_inline(
-        h.asm["ttgir"], """\
+        h.asm["source"], """\
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
 module attributes {"ttg.num-warps" = 4 : i32} {
@@ -37,6 +37,15 @@ def test_convert_layout(fresh_knobs):
   } loc(#loc)
 } loc(#loc)
 #loc = loc(unknown)
+""")
+    expecttest.assert_expected_inline(
+        h.asm["ttgir"], """\
+module attributes {"ttg.num-warps" = 4 : i32} {
+  tt.func public @convert_layout_kernel() attributes {noinline = false} {
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc = loc(unknown)
 """)
 
 
@@ -60,7 +69,7 @@ def test_shared_memory(fresh_knobs):
     h = shared_memory_kernel.warmup(8, 32, layout_a, layout_b, smem_layout, num_warps=layout_a.warps_per_cta[0],
                                     grid=(1, ))
     expecttest.assert_expected_inline(
-        h.asm["ttgir"], """\
+        h.asm["source"], """\
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
@@ -103,7 +112,7 @@ def test_tensor_memory(fresh_knobs):
     tmem_layout = ttgl.nvidia.blackwell.TensorMemoryLayout(block=[128, 128], unpacked=True)
     h = tensor_memory_kernel.warmup(layout, tmem_layout, num_warps=4, grid=(1, ))
     expecttest.assert_expected_inline(
-        h.asm["ttgir"], """\
+        h.asm["source"], """\
 #blocked = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
 module attributes {"ttg.num-warps" = 4 : i32} {
@@ -200,7 +209,7 @@ def test_mbarrier(fresh_knobs):
 
     h = mbarrier_kernel.warmup(grid=(1, ))
     expecttest.assert_expected_inline(
-        h.asm["ttgir"], """\
+        h.asm["source"], """\
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-warps" = 4 : i32} {
@@ -240,7 +249,7 @@ def test_tcgen05_mma(fresh_knobs):
 
     h = tcgen05_mma_kernel.warmup(nvmma_layout, acc_layout, grid=(1, ))
     expecttest.assert_expected_inline(
-        h.asm["ttgir"], """\
+        h.asm["source"], """\
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #smem = #ttg.shared_memory
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
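The tests now pin both artifacts: `h.asm["source"]` holds the TTGIR exactly as Gluon emitted it, while `h.asm["ttgir"]` holds the result after the new pipeline. In test_convert_layout the redundant layout conversion folds away entirely, leaving just `tt.return`.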

python/triton/backends/compiler.py

Lines changed: 7 additions & 0 deletions
@@ -1,5 +1,6 @@
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, Union
 from types import ModuleType
 
@@ -13,6 +14,12 @@ class GPUTarget(object):
     warp_size: int
 
 
+class Language(Enum):
+    """The input language being compiled by the backend."""
+    TRITON = 0
+    GLUON = 1
+
+
 class BaseBackend(metaclass=ABCMeta):
 
     def __init__(self, target: GPUTarget) -> None:
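To show how the new `language` argument is meant to be consumed, here is a minimal sketch of a backend branching on it in `add_stages`. The helper names `make_ttir`, `make_ttgir`, and `gluon_to_ttgir` are assumptions; the signature matches the test-backend change above.

    # Sketch only: the stage-builder helper names are hypothetical.
    from triton.backends.compiler import BaseBackend, Language

    class MyBackend(BaseBackend):
        def add_stages(self, stages, options, language):
            if language == Language.TRITON:
                # Triton source lowers AST -> TTIR -> TTGIR.
                stages["ttir"] = lambda src, md: self.make_ttir(src, md, options)
                stages["ttgir"] = lambda src, md: self.make_ttgir(src, md, options)
            elif language == Language.GLUON:
                # Gluon parses straight to TTGIR, so skip the TTIR stage and
                # run the new inliner + canonicalizer pipeline instead.
                stages["ttgir"] = lambda src, md: self.gluon_to_ttgir(src, md, options)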
