intel
diff --git a/test/TritonIntelGPU/dot-operands.mlir b/test/TritonIntelGPU/dot-operands.mlir
Lines changed: 1 addition & 0 deletions
diff --git a/third_party/intel/include/Dialect/Triton/Transforms/Passes.td b/third_party/intel/include/Dialect/Triton/Transforms/Passes.td
Lines changed: 2 additions & 2 deletions
diff --git a/third_party/intel/include/Utils/DefUseChain.h b/third_party/intel/include/Utils/DefUseChain.h
Lines changed: 68 additions & 0 deletions
@@ -356,6 +356,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.th
 }
 
 // -----
+
 #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [16, 0], [0, 16], [0, 32]], lane = [[1, 0], [2, 0], [4, 0], [8, 0]], warp = [[0, 0], [0, 0]], block = []}>
 #mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
 
@@ -55,14 +55,14 @@ def TritonIntelFuseReshape
                                   {order = array<i32: 2, 1, 0>} : <tensor<1x512x64xf16>>
         %load = tt.load %ptr {boundaryCheck = array<i32: 2>} : !tt.ptr<tensor<1x512x64xf16>>
         %A = tt.reshape %load : tensor<1x512x64xf16> -> tensor<512x64xf16>
-        %dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
+        dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
 
     The transformation drops the reshape operation, and generates:
         %div = %a / %b
         %ptr = tt.make_tensor_ptr %base_ptr, [%s0 * %div + %s1, %s2], [%b, %c], [%x * %div + %y, %z]
                                   {order = array<i32: 1, 0>} : <tensor<512x64xf16>>
         %A = tt.load %ptr {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<512x64xf16>>
-        %dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
+        dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
   }];
 
   let dependentDialects = [
 
@@ -1,7 +1,9 @@
 #ifndef TRITON_INTEL_UTILS_DEFUSECHAIN_H
 #define TRITON_INTEL_UTILS_DEFUSECHAIN_H
 
+#include "Utils/Utility.h"
 #include "mlir/IR/Value.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
 #include "llvm/ADT/SetVector.h"
 #include <unordered_set>
 
@@ -97,6 +99,72 @@ class DefUseChainManager {
   DefUseChains chains;
 };
 
+/// \class Fuser
+/// Abstract base class for fusing operations within a set of def-use chains.
+/// Derived classes implement fuse(const DefUseChain &) and propagateToUser.
+class Fuser {
+protected:
+  SmallPtrSet<Operation *, 8> cleanUp; // Passed to eraseOperations() on destruction.
+
+  virtual ~Fuser() {
+    if (!cleanUp.empty())
+      eraseOperations(cleanUp);
+  }
+
+  using DefUseChain = intel::DefUseChain;
+  using DefUseChainManager = intel::DefUseChainManager;
+  using DefUseChains = DefUseChainManager::DefUseChains;
+
+  // Delegate to derived classes details on which operations within a
+  // DefUseChain to fuse.
+  virtual void fuse(const DefUseChain &) = 0;
+
+  // Fuse operations in the given \p chains.
+  void fuse(const DefUseChains &chains);
+
+  // Duplicate the root operation of the given \p chains.
+  void duplicateRoot(DefUseChains &chains) const;
+
+  // Duplicate the root operation of \p sameRootChains and update \p chains.
+  void duplicateRoot(DefUseChains &sameRootChains, DefUseChains &chains) const;
+
+  // Prune \p chains that cannot be handled during fusion. For example,
+  // operations in the def-use chain should have a single user, except in
+  // special circumstances (e.g. the root operation of a chain might have more
+  // than one user).
+  void pruneInvalid(DefUseChains &chains) const;
+
+  // Determine whether all operations in the given def-use \p chain have a
+  // single user. Note: we allow an operation in the def-use chain to have an
+  // additional user if the operation is in a for loop, and the additional user
+  // is the loop yield operation, provided that the result yielded is not used
+  // after the loop. Example:
+  //   make_tensor_ptr -> advance -> load (OK)
+  //   make_tensor_ptr -> for init_arg -> advance -> load (OK)
+  //                                   -> yield (OK)
+  //   make_tensor_ptr -> for init_arg -> advance -> load (OK)
+  //                                              -> yield -> load (NOT OK)
+  //
+  bool validateChain(const DefUseChain &chain) const;
+
+  // Propagate \p newVal to operations in the given def-use \p chain.
+  void propagateToUsers(Value newVal, const DefUseChain &chain,
+                        IRMapping &mapping);
+
+  // Propagate \p newVal to users of \p origOp.
+  void propagateToUsers(Value newVal, Value origVal, Operation *origOp,
+                        Operation *sentinel, IRMapping &mapping);
+
+  // If \p user is not \p sentinel, propagate \p newVal to \p user. Otherwise
+  // terminate the propagation.
+  virtual void propagateToUser(Value newVal, Value origVal, Operation *user,
+                               Operation *sentinel, IRMapping &mapping) = 0;
+
+  // Propagate \p newVal to users of \p origOp in the given \p loopOp.
+  void propagateToLoop(Value newVal, Value origVal, LoopLikeOpInterface loopOp,
+                       Operation *sentinel, IRMapping &mapping);
+};
+
 } // namespace mlir::triton::intel
 
 #endif // TRITON_INTEL_UTILS_DEFUSECHAIN_H
Original file line number	Diff line number	Diff line change
`@@ -356,6 +356,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.th`
`356`	`356`	`}`
`357`	`357`
`358`	`358`	`// -----`
	`359`	`+`
`359`	`360`	`#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [16, 0], [0, 16], [0, 32]], lane = [[1, 0], [2, 0], [4, 0], [8, 0]], warp = [[0, 0], [0, 0]], block = []}>`
`360`	`361`	`#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>`
`361`	`362`	`module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {`