intel
diff --git a/include/triton/Dialect/Triton/IR/Interfaces.h b/include/triton/Dialect/Triton/IR/Interfaces.h
Lines changed: 36 additions & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Passes.td b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
Lines changed: 13 additions & 0 deletions
diff --git a/lib/Dialect/Triton/IR/Dialect.cpp b/lib/Dialect/Triton/IR/Dialect.cpp
Lines changed: 37 additions & 58 deletions
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/lib/Dialect/TritonGPU/Transforms/Canonicalize.cpp b/lib/Dialect/TritonGPU/Transforms/Canonicalize.cpp
Lines changed: 46 additions & 0 deletions
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
Lines changed: 57 additions & 0 deletions
diff --git a/python/src/passes.cc b/python/src/passes.cc
Lines changed: 7 additions & 0 deletions
diff --git a/python/test/backend/test_device_backend.py b/python/test/backend/test_device_backend.py
Lines changed: 1 addition & 1 deletion
@@ -1,9 +1,45 @@
 #ifndef TRITON_IR_INTERFACES_H_
 #define TRITON_IR_INTERFACES_H_
 
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Transforms/InliningUtils.h"
 
 #define GET_TYPEDEF_CLASSES
 #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc"
 
+namespace mlir::triton {
+
+//===----------------------------------------------------------------------===//
+// TritonDialect Dialect Interfaces
+//===----------------------------------------------------------------------===//
+
+/// Dialect inliner interface for Triton. It answers the inliner's legality
+/// queries and rewrites the terminators of inlined regions. The members
+/// declared here without a body are defined out of line.
+struct TritonInlinerInterface : DialectInlinerInterface {
+  using DialectInlinerInterface::DialectInlinerInterface;
+
+  //===--------------------------------------------------------------------===//
+  // Analysis Hooks
+  //===--------------------------------------------------------------------===//
+
+  /// Decide whether `callable` may be inlined at the call site `call`.
+  bool isLegalToInline(Operation *call, Operation *callable,
+                       bool wouldBeCloned) const final;
+
+  /// Inlining one region into another is always permitted.
+  bool isLegalToInline(Region * /*dest*/, Region * /*src*/,
+                       bool /*wouldBeCloned*/,
+                       IRMapping & /*valueMapping*/) const final {
+    return true;
+  }
+
+  /// Inlining an individual operation into a region is always permitted.
+  bool isLegalToInline(Operation * /*op*/, Region * /*dest*/,
+                       bool /*wouldBeCloned*/,
+                       IRMapping & /*valueMapping*/) const final {
+    return true;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Transformation Hooks
+  //===--------------------------------------------------------------------===//
+
+  /// Rewrite the inlined terminator `op` for the case where control has to
+  /// continue in the block `newDest`.
+  void handleTerminator(Operation *op, Block *newDest) const final;
+
+  /// Rewrite the inlined terminator `op` for the case where the values in
+  /// `valuesToRepl` are to be replaced by what the terminator returns.
+  void handleTerminator(Operation *op, ValueRange valuesToRepl) const final;
+};
+
+} // namespace mlir::triton
+
 #endif // TRITON_IR_INTERFACES_H_
@@ -360,4 +360,17 @@ def TritonGPUCoalesceAsyncCopy: Pass<"tritongpu-coalesce-async-copy", "mlir::Mod
                            "mlir::triton::TritonDialect"];
 }
 
+// Pass record for `tritongpu-canonicalize`. Only the pass argument is given to
+// `Pass<>`; no anchor operation type is named as a second template argument.
+// The dependent dialects listed below are the ones whose dialect-level
+// canonicalization patterns the pass implementation looks up at run time.
+def TritonGPUCanonicalize: Pass<"tritongpu-canonicalize"> {
+  let summary = "reduced set of simplifications for TTGIR";
+
+  let description = [{
+    The `tritongpu-canonicalize` pass applies a reduced set of simplification
+    and canonicalization patterns to the module.
+  }];
+  let dependentDialects = [
+    "mlir::arith::ArithDialect",
+    "mlir::scf::SCFDialect",
+  ];
+}
+
 #endif
@@ -1,16 +1,12 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Interfaces.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/raw_ostream.h"
 
-#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/IR/DialectImplementation.h"
-
-#include "mlir/Transforms/InliningUtils.h"
 #include "triton/Dialect/Triton/IR/AttrInterfaces.cpp.inc"
 #include "triton/Dialect/Triton/IR/Dialect.cpp.inc"
 #include "triton/Dialect/Triton/IR/OpInterfaces.cpp.inc"
@@ -22,62 +18,45 @@ using namespace mlir::triton;
 // TritonDialect Dialect Interfaces
 //===----------------------------------------------------------------------===//
 
-namespace {
-struct TritonInlinerInterface : public DialectInlinerInterface {
-  using DialectInlinerInterface::DialectInlinerInterface;
-
-  bool isLegalToInline(Operation *call, Operation *callable,
-                       bool wouldBeCloned) const final {
-    auto funcOp = dyn_cast<triton::FuncOp>(callable);
-    if (!funcOp)
-      return true;
-    if (funcOp->hasAttr("noinline"))
-      return !funcOp->getAttrOfType<BoolAttr>("noinline").getValue();
-    return true;
-  }
-
-  bool isLegalToInline(Region *dest, Region *src, bool wouldBeCloned,
-                       IRMapping &valueMapping) const final {
-    return true;
-  }
-
-  bool isLegalToInline(Operation *, Region *, bool wouldBeCloned,
-                       IRMapping &) const final {
+/// Decide whether `callable` may be inlined at `call`.
+///
+/// Anything that is not a `triton::FuncOp` is always inlinable. A
+/// `triton::FuncOp` is inlinable unless it carries a boolean `noinline`
+/// attribute whose value is true.
+bool TritonInlinerInterface::isLegalToInline(Operation *call,
+                                             Operation *callable,
+                                             bool wouldBeCloned) const {
+  auto funcOp = dyn_cast<triton::FuncOp>(callable);
+  if (!funcOp)
     return true;
-  }
-  //===--------------------------------------------------------------------===//
-  // Transformation Hooks
-  //===--------------------------------------------------------------------===//
-
-  /// Handle the given inlined terminator by replacing it with a new operation
-  /// as necessary.
-  void handleTerminator(Operation *op, Block *newDest) const final {
-    // Only return needs to be handled here.
-    auto returnOp = dyn_cast<triton::ReturnOp>(op);
-    if (!returnOp)
-      return;
-
-    // Replace the return with a branch to the dest.
-    OpBuilder builder(op);
-    builder.create<mlir::cf::BranchOp>(op->getLoc(), newDest,
-                                       returnOp.getOperands());
-    op->erase();
-  }
-
-  /// Handle the given inlined terminator by replacing it with a new operation
-  /// as necessary.
-  void handleTerminator(Operation *op, ValueRange valuesToRepl) const final {
-    // Only return needs to be handled here.
-    auto returnOp = cast<triton::ReturnOp>(op);
+  // Use one typed lookup instead of `hasAttr` followed by `getAttrOfType`:
+  // the typed lookup yields a null attribute when `noinline` is present with
+  // a non-boolean type, and calling `getValue()` on that would dereference
+  // null. Such an attribute is ignored here.
+  if (auto noInlineAttr = funcOp->getAttrOfType<BoolAttr>("noinline"))
+    return !noInlineAttr.getValue();
+  return true;
+}
 
-    // Replace the values directly with the return operands.
-    assert(returnOp.getNumOperands() == valuesToRepl.size());
-    for (const auto &it : llvm::enumerate(returnOp.getOperands()))
-      valuesToRepl[it.index()].replaceAllUsesWith(it.value());
-  }
-};
+/// Rewrite an inlined `triton::ReturnOp` into a `cf.br` that jumps to
+/// `newDest`, forwarding the returned values as branch operands. Terminators
+/// of any other kind are left as they are.
+void TritonInlinerInterface::handleTerminator(Operation *op,
+                                              Block *newDest) const {
+  if (auto returnOp = dyn_cast<triton::ReturnOp>(op)) {
+    // Build the branch right before the return, then drop the return.
+    OpBuilder branchBuilder(op);
+    branchBuilder.create<mlir::cf::BranchOp>(op->getLoc(), newDest,
+                                             returnOp.getOperands());
+    op->erase();
+  }
+}
 
-} // namespace
+/// Forward the operands of an inlined `triton::ReturnOp` to the users of
+/// `valuesToRepl`: every use of the i-th value is redirected to the i-th
+/// return operand. The terminator itself is not erased here.
+void TritonInlinerInterface::handleTerminator(Operation *op,
+                                              ValueRange valuesToRepl) const {
+  // `cast` asserts that the terminator really is a `triton::ReturnOp`.
+  auto returnOp = cast<triton::ReturnOp>(op);
+
+  // One returned operand is expected per value being replaced.
+  assert(returnOp.getNumOperands() == valuesToRepl.size());
+  for (unsigned idx = 0, numReturned = returnOp.getNumOperands();
+       idx != numReturned; ++idx)
+    valuesToRepl[idx].replaceAllUsesWith(returnOp->getOperand(idx));
+}
 
 void TritonDialect::initialize() {
   registerTypes();
 
@@ -10,6 +10,7 @@
 
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Interfaces.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -3119,6 +3120,7 @@ void TritonGPUDialect::initialize() {
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
 #include "triton/Dialect/TritonGPU/IR/OpsEnums.cpp.inc"
       >();
+  addInterfaces<TritonInlinerInterface>();
   addInterfaces<TritonGPUOpAsmInterface>();
   addInterfaces<TritonGPUInferLayoutInterface>();
   addInterfaces<TritonGPUVerifyTensorLayoutInterface>();
 
@@ -1,5 +1,6 @@
 add_triton_library(TritonGPUTransforms
   AccelerateMatmul.cpp
+  Canonicalize.cpp
   Coalesce.cpp
   F32DotTC.cpp
   FuseNestedLoops.cpp
 
@@ -0,0 +1,46 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+
+using namespace mlir;
+using namespace triton;
+namespace ttg = triton::gpu;
+namespace ttng = triton::nvidia_gpu;
+
+namespace mlir::triton::gpu {
+#define GEN_PASS_DEF_TRITONGPUCANONICALIZE
+#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
+} // namespace mlir::triton::gpu
+
+namespace {
+/// Implementation of the `tritongpu-canonicalize` pass: runs a hand-picked
+/// subset of canonicalization patterns instead of the full canonicalizer.
+struct Canonicalize
+    : public ttg::impl::TritonGPUCanonicalizeBase<Canonicalize> {
+  void runOnOperation() override;
+};
+} // namespace
+
+/// Collect the selected canonicalization patterns and apply them greedily to
+/// the operation this pass runs on.
+void Canonicalize::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  RewritePatternSet patterns(ctx);
+
+  // Populate `arith` and `scf` canonicalizers.
+  ctx->getLoadedDialect<arith::ArithDialect>()->getCanonicalizationPatterns(
+      patterns);
+  ctx->getLoadedDialect<scf::SCFDialect>()->getCanonicalizationPatterns(
+      patterns);
+  populateForOpDeadArgumentElimination(patterns);
+
+  // Populate select Triton canonicalization patterns. The important patterns to
+  // EXCLUDE are those that modify layouts, especially `ConvertLayoutOp`
+  // patterns.
+  LoadOp::getCanonicalizationPatterns(patterns, ctx);
+  StoreOp::getCanonicalizationPatterns(patterns, ctx);
+  BroadcastOp::getCanonicalizationPatterns(patterns, ctx);
+  ExpandDimsOp::getCanonicalizationPatterns(patterns, ctx);
+  ttg::WarpSpecializeOp::getCanonicalizationPatterns(patterns, ctx);
+  ttng::TensorDescToTMAPtrOp::getCanonicalizationPatterns(patterns, ctx);
+
+  // Actually run the rewrite driver; without this call the pattern set is
+  // built and then discarded, so the pass leaves the IR unchanged. The result
+  // is deliberately ignored: a failure only reports that the driver did not
+  // converge, which is not treated as a pass failure here.
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+}
@@ -6,6 +6,7 @@
 #include "mlir/IR/Types.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Types.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 using namespace mlir;
@@ -80,6 +81,17 @@ void init_gluon_ir(py::module &&m) {
                  ctx, swizzleByteWidth, transposed, elementBitwidth, fp4Padded,
                  ctaLayout);
            })
+      .def("get_swizzled_shared_layout",
+           [](GluonOpBuilder &self, int vec, int perPhase, int maxPhase,
+              std::vector<unsigned> &order, std::vector<unsigned> &ctasPerCga,
+              std::vector<unsigned> &ctaSplitNum,
+              std::vector<unsigned> &ctaOrder) -> Attribute {
+             auto ctx = self.getContext();
+             auto ctaLayout = ttg::CTALayoutAttr::get(ctx, ctasPerCga,
+                                                      ctaSplitNum, ctaOrder);
+             return ttg::SwizzledSharedEncodingAttr::get(
+                 ctx, vec, perPhase, maxPhase, order, ctaLayout);
+           })
       .def("get_tensor_memory_layout",
            [](GluonOpBuilder &self, std::vector<unsigned> &block, bool unpacked,
               std::vector<unsigned> &ctaSplitNum) -> Attribute {
@@ -94,6 +106,10 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::ConvertLayoutOp>(resultTy, value);
            })
+      .def("create_local_alloc",
+           [](GluonOpBuilder &self, Type resultTy) -> Value {
+             return self.create<ttg::LocalAllocOp>(resultTy);
+           })
       .def("create_local_alloc",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::LocalAllocOp>(resultTy, value);
@@ -106,10 +122,19 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultTy, Value memDesc) -> Value {
              return self.create<ttg::LocalLoadOp>(resultTy, memDesc);
            })
+      .def("create_local_dealloc",
+           [](GluonOpBuilder &self, Value memDesc) -> Operation * {
+             return self.create<ttg::LocalDeallocOp>(memDesc);
+           })
+
       .def("create_tmem_alloc",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttng::TMEMAllocOp>(resultTy, value);
            })
+      .def("create_tmem_alloc",
+           [](GluonOpBuilder &self, Type resultTy, py::none value) -> Value {
+             return self.create<ttng::TMEMAllocOp>(resultTy, Value{});
+           })
       .def("create_tmem_store",
            [](GluonOpBuilder &self, Value memDesc, Value value, Value pred) {
              self.create<ttng::TMEMStoreOp>(memDesc, value, pred);
@@ -123,6 +148,38 @@ void init_gluon_ir(py::module &&m) {
               int N) -> Value {
              return self.create<ttng::TMEMSubSliceOp>(resultTy, memDesc, N);
            })
+      .def("create_mbarrier_init",
+           [](GluonOpBuilder &self, Value memDesc, int count) {
+             self.create<ttng::InitBarrierOp>(memDesc, count);
+           })
+      .def("create_mbarrier_inval",
+           [](GluonOpBuilder &self, Value memDesc) {
+             self.create<ttng::InvalBarrierOp>(memDesc);
+           })
+      .def("create_mbarrier_expect",
+           [](GluonOpBuilder &self, Value memDesc, int bytes, Value pred) {
+             self.create<ttng::BarrierExpectOp>(memDesc, bytes, pred);
+           })
+      .def("create_mbarrier_wait",
+           [](GluonOpBuilder &self, Value memDesc, Value phase, Value pred,
+              std::vector<Value> &deps) {
+             self.create<ttng::WaitBarrierOp>(memDesc, phase, pred, deps);
+           })
+      .def("create_mbarrier_arrive",
+           [](GluonOpBuilder &self, Value memDesc, int count, Value pred) {
+             self.create<ttng::ArriveBarrierOp>(memDesc, count, pred);
+           })
+      .def("create_tcgen05_mma",
+           [](GluonOpBuilder &self, Value a, Value b, Value acc, Value useAcc,
+              Value pred, std::vector<Value> &mbarriers,
+              std::vector<Value> &mbarrier_preds) {
+             Value accDep;
+             bool two_ctas = false;
+             auto tokType = self.getBuilder().getType<ttg::AsyncTokenType>();
+             self.create<ttng::TCGen5MMAOp>(tokType, a, b, acc, accDep, useAcc,
+                                            pred, two_ctas, mbarriers,
+                                            mbarrier_preds);
+           })
       .def("create_warp_return",
            [](GluonOpBuilder &self) -> Operation * {
              return self.create<ttg::WarpReturnOp>();
 
@@ -51,6 +51,7 @@ void init_triton_passes_ttir(py::module &&m) {
 }
 
 void init_triton_passes_ttgpuir(py::module &&m) {
+  using namespace mlir;
   using namespace mlir::triton::gpu;
   ADD_PASS_WRAPPER_0("add_coalesce", createTritonGPUCoalesce);
   ADD_PASS_WRAPPER_0("add_optimize_thread_locality",
@@ -85,6 +86,12 @@ void init_triton_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_fuse_nested_loops", createTritonGPUFuseNestedLoops);
   ADD_PASS_WRAPPER_0("add_coalesce_async_copy",
                      createTritonGPUCoalesceAsyncCopy);
+  ADD_PASS_WRAPPER_0("add_canonicalizer", createTritonGPUCanonicalize);
+  ADD_PASS_WRAPPER_0("add_inliner", [] {
+    return createInlinerPass(/*opPipelines=*/{}, [](OpPassManager &pm) {
+      pm.addPass(createTritonGPUCanonicalize());
+    });
+  });
 }
 
 void init_triton_passes_convert(py::module &&m) {
 
@@ -94,7 +94,7 @@ def __init__(self, device_type: str) -> None:
         self.driver = ExtensionDriver()
         self.version_key = None
 
-    def add_stages(self, arch, extern_libs, stages):
+    def add_stages(self, stages, options, language):
         filter_in_stages = ["ast", "ttir", "ttgir"]
         filter_out_stages = []
         for key, _ in stages.items():