Commit 424f98b

Make fuse reshape with load op deterministic (#5327)
Similar to #4323. The main idea is to get rid of data structures in which pointers are sorted: ordering by raw pointer value follows allocation addresses, so it can differ from run to run and makes the pass non-deterministic.

Signed-off-by: Anatoly Myachev <[email protected]>
1 parent c437c95 commit 424f98b
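
As a rough illustration of the idea (not the actual patch), the container change can be sketched with the standard library alone. The names Op, Chain, and ChainHash below are made up for the sketch; the real code in DefUseChain.h keys on mlir::Operation* and hashes with llvm::hash_combine, as shown in the diff further down.

// Minimal sketch, standard library only.
// Before: a std::set whose operator< compared raw pointers, so iteration order
// followed heap addresses and could differ between runs.
// After: an unordered container with an explicit hash over (start, end), so no
// code path depends on the relative order of pointer values.
#include <cstddef>
#include <functional>
#include <unordered_set>

struct Op {}; // stand-in for mlir::Operation

struct Chain {
  const Op *start = nullptr;
  const Op *end = nullptr;
  bool operator==(const Chain &other) const {
    return start == other.start && end == other.end;
  }
};

struct ChainHash {
  size_t operator()(const Chain &c) const noexcept {
    size_t h1 = std::hash<const Op *>{}(c.start);
    size_t h2 = std::hash<const Op *>{}(c.end);
    return h1 ^ (h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2)); // boost-style combine
  }
};

using Chains = std::unordered_set<Chain, ChainHash>; // was: std::set<Chain>

Note that the real DefUseChain compares the full operation set in operator== (see the header diff below); the sketch only keeps the (start, end) pair that the new hash uses.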

3 files changed, 11 insertions(+), 10 deletions(-)

test/Triton/Intel/FuseReshape/fuse-reshape.mlir (2 additions, 2 deletions)

@@ -185,12 +185,12 @@ tt.func public @fuseLoadWithReshape4(%arg0: i32, %arg1: !tt.ptr<f16>, %arg2: !tt
 // CHECK: [[ADD22:%.*]] = arith.addi [[MUL22]], %c1_i32 : i32
 // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr %arg2, [[[ADD12]], %c64_i64], [%c64_i64, %c1_i64], [[[ADD22]], %c2_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16>>
 // CHECK: scf.for
-// CHECK: [[ADV:%.*]] = tt.advance [[PTR2]], {{.*}} : <tensor<32x64xf16>>
+// CHECK: [[ADV:%.*]] = tt.advance [[PTR1]], {{.*}} : <tensor<32x64xf16>>
 // CHECK: [[LOAD_B1:%.*]] = tt.load [[ADV]] : !tt.ptr<tensor<32x64xf16>>
 // CHECK: tt.dot {{.*}}, [[LOAD_B1]], {{.*}}, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32>
 // CHECK: scf.yield
 // CHECK: scf.for
-// CHECK: [[ADV:%.*]] = tt.advance [[PTR1]], {{.*}} : <tensor<32x64xf16>>
+// CHECK: [[ADV:%.*]] = tt.advance [[PTR2]], {{.*}} : <tensor<32x64xf16>>
 // CHECK: [[LOAD_B1:%.*]] = tt.load [[ADV]] : !tt.ptr<tensor<32x64xf16>>
 // CHECK: tt.dot {{.*}}, [[LOAD_B1]], {{.*}}, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32>
 // CHECK: scf.yield

third_party/intel/include/Utils/DefUseChain.h (8 additions, 7 deletions)

@@ -3,6 +3,7 @@
 
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/SetVector.h"
+#include <unordered_set>
 
 namespace mlir::triton::intel {
 
@@ -20,12 +21,6 @@ class DefUseChain {
 
   DefUseChain() = delete;
 
-  bool operator<(const DefUseChain &other) const {
-    if (start == other.start)
-      return end < other.end;
-    return start < other.start;
-  }
-
   bool operator==(const DefUseChain &other) const { return ops == other.ops; }
 
   const Operations &getOps() const { return ops; }
@@ -61,13 +56,19 @@ class DefUseChain {
   Operation *end; //< last operation in the chain
 };
 
+struct DefUseChainHash {
+  size_t operator()(const mlir::triton::intel::DefUseChain &c) const noexcept {
+    return llvm::hash_combine(c.getStart(), c.getEnd());
+  }
+};
+
 /// \class DefUseChainManager
 /// Manages collection of one or more \class DefUseChain.
 class DefUseChainManager {
   friend raw_ostream &operator<<(raw_ostream &, const DefUseChainManager &);
 
 public:
-  using DefUseChains = std::set<DefUseChain>;
+  using DefUseChains = std::unordered_set<DefUseChain, DefUseChainHash>;
   using Operations = DefUseChain::Operations;
 
   /// Create all def-use chains rooted at \p start and terminated by \p end.

third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp (1 addition, 1 deletion)

@@ -111,7 +111,7 @@ class FuseTransWithLoad {
 private:
   // Duplicate the root operation of the given chains.
   void duplicateRoot(DefUseChains &chains) const {
-    std::map<Operation *, DefUseChains> rootToChains;
+    std::unordered_map<Operation *, DefUseChains> rootToChains;
     for (const DefUseChain &chain : chains) {
       Operation *start = chain.getStart();
       if (!rootToChains[start].empty())
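
Continuing the illustrative Op/Chain/Chains types from the sketch near the top (still an assumption-laden sketch, not the project's code), the grouping step that duplicateRoot builds its rootToChains map for looks roughly like this; groupChainsByRoot is a hypothetical helper name.

// Sketch only: reuses the illustrative Op/Chain/Chains types defined earlier.
// Buckets chains by their root ("start") operation, mirroring the rootToChains
// map in duplicateRoot after the std::map -> std::unordered_map switch.
#include <unordered_map>

std::unordered_map<const Op *, Chains> groupChainsByRoot(const Chains &chains) {
  std::unordered_map<const Op *, Chains> rootToChains;
  for (const Chain &chain : chains)
    rootToChains[chain.start].insert(chain);
  return rootToChains;
}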
