Commit c24d86c

[Gluon] Fix auto_encoding for ops which may infer multiple layouts (#7718)
In the failing example we have:

```mlir
%0 = tt.make_range {end = 8192 : i32, start = 0 : i32} : tensor<8192xi32, #gluon.auto_encoding>
%1 = tt.reshape %0 : tensor<8192xi32, #gluon.auto_encoding> -> tensor<64x128xi32, #gluon.auto_encoding>
%2 = gluon.set_auto_layout %1 : tensor<64x128xi32, #gluon.auto_encoding> -> tensor<64x128xi32, #blocked>
```

which currently fails with the error:

```
/root/code/triton/test.py:43:52: error: 'tt.reshape' op Found conflicting encodings for value gl.arange(0, BLOCK_M * BLOCK_N).reshape((BLOCK_M, BLOCK_N)),
```

The issue is that we propagate the blocked layout backwards to get a linear layout for the `make_range` result, and the algorithm then propagates that layout forward to the `reshape` result. There it infers a linear layout, which conflicts with the original blocked layout, so the pass errors out. I fix this by setting a `mayVary` flag when an encoding comes from an inference result that isn't the only possibility, and by adding special rules that resolve conflicts where one or more of the encodings is allowed to vary.
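For context, a Gluon kernel along the following lines would lower to the IR above. This is a hypothetical repro sketch, not the actual failing test: the decorator, layout constructor, and layout-pinning helper names are assumptions for illustration; only the `arange(...).reshape(...)` expression is taken from the error message.

```python
# Hypothetical repro sketch: an arange feeding a reshape whose result is then
# pinned to a concrete blocked layout, so the pass must propagate the layout
# backwards to the arange and forwards again through the reshape. Names other
# than arange(...).reshape(...) are assumptions for illustration.
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def reshape_kernel(BLOCK_M: ttgl.constexpr, BLOCK_N: ttgl.constexpr):
    # Assumed blocked layout matching the #blocked attribute in the IR above.
    blocked: ttgl.constexpr = ttgl.BlockedLayout(
        size_per_thread=[1, 64], threads_per_warp=[16, 2],
        warps_per_cta=[4, 1], order=[0, 1])
    # Both intermediate tensors carry #gluon.auto_encoding until the pass runs.
    idx = ttgl.arange(0, BLOCK_M * BLOCK_N).reshape((BLOCK_M, BLOCK_N))
    # Pinning the reshape result to a concrete layout seeds the propagation
    # that previously tripped the "conflicting encodings" error.
    idx = ttgl.convert_layout(idx, blocked)
```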
1 parent e40c213 commit c24d86c

File tree: 4 files changed, +117 −29 lines changed

lib/Dialect/Gluon/Transforms/ResolveAutoEncodings.cpp

Lines changed: 97 additions & 27 deletions
```diff
@@ -1,3 +1,4 @@
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Support/LLVM.h"
@@ -9,6 +10,8 @@
 #include "llvm/ADT/PriorityWorklist.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
 
 namespace ttg = mlir::triton::gpu;
 
@@ -28,6 +31,65 @@ bool isAutoEncodingTensorType(Type ty) {
   return tensorTy && isa<gluon::AutoEncodingAttr>(tensorTy.getEncoding());
 }
 
+struct LayoutInfo {
+  Attribute encoding;
+  // Some operations can infer one of many encodings,
+  // we model this by setting the mayVary flag on encodings
+  // derived from these ops.
+  // If "may vary" is set then we allow conflicts, and when
+  // resolving conflicts we prefer encodings that are not allowed to vary.
+  bool mayVary = false;
+
+  operator bool() { return bool(encoding); }
+};
+
+uint64_t hashWithMemo(Attribute attr,
+                      llvm::MapVector<Attribute, uint64_t> &hashMemo) {
+  auto it = hashMemo.find(attr);
+  if (it != hashMemo.end()) {
+    return it->second;
+  }
+
+  // llvm::hash_value is not stable, so instead we hash the string repr of the
+  // attribute
+  std::string str;
+  llvm::raw_string_ostream os(str);
+  attr.print(os);
+  auto hash = llvm::xxh3_64bits(str);
+  hashMemo.try_emplace(attr, hash);
+  return hash;
+}
+
+bool compare(Attribute a, Attribute b,
+             llvm::MapVector<Attribute, uint64_t> &hashMemo) {
+  if (a == b)
+    return false;
+
+  return hashWithMemo(a, hashMemo) > hashWithMemo(b, hashMemo);
+}
+
+LayoutInfo combineInfo(LayoutInfo lhs, LayoutInfo rhs, Operation *op,
+                       llvm::MapVector<Attribute, uint64_t> &hashMemo) {
+  // Sort inputs so this operation is commutative
+  if (compare(lhs.encoding, rhs.encoding, hashMemo)) {
+    std::swap(lhs, rhs);
+  }
+  if (lhs.mayVary)
+    return rhs;
+  if (rhs.mayVary)
+    return lhs;
+  if (lhs.encoding == rhs.encoding)
+    return lhs;
+  op->emitOpError("found conflicting encodings for value:\n ")
+      << lhs.encoding << "\nand\n " << rhs.encoding;
+  return {};
+}
+
+bool encodingsMayVary(Operation *op) {
+  return isa<triton::JoinOp, triton::SplitOp, triton::ReshapeOp, triton::CatOp,
+             triton::TransOp>(op);
+}
+
 LogicalResult inferAutoLayouts(FuncOp func) {
   // Disallow auto encoding accross function call boundaries
   for (auto argTy : func.getArgumentTypes()) {
@@ -42,33 +104,37 @@ LogicalResult inferAutoLayouts(FuncOp func) {
         "Functions returning auto encoding must be fully inlined");
   }
 
-  llvm::MapVector<Value, Attribute> valueToEncoding;
+  llvm::MapVector<Value, LayoutInfo> valueToEncoding;
   llvm::PriorityWorklist<Value> worklist;
+  llvm::MapVector<Attribute, uint64_t> hashMemo;
 
   auto updateEncoding = [&](ArrayRef<Value> values,
-                            Attribute enc) -> LogicalResult {
+                            LayoutInfo info) -> LogicalResult {
     for (auto value : values) {
-      auto [it, inserted] = valueToEncoding.insert({value, enc});
+      auto [it, inserted] = valueToEncoding.insert({value, info});
       if (!inserted) {
-        if (it->second != enc) {
-          auto defOp = value.getDefiningOp();
-          auto op = defOp ? defOp : func;
-          return op->emitOpError("Found conflicting encodings for value");
-        }
-      } else {
-        LLVM_DEBUG({
-          DBGS() << "Setting value:\n\t" << value << "\nto encoding:\n\t" << enc
-                 << "\n";
-        });
-        worklist.insert(value);
+        auto defOp = value.getDefiningOp();
+        auto op = defOp ? defOp : func;
+        auto combine = combineInfo(it->second, info, op, hashMemo);
+        if (!combine)
+          return failure();
+        if (combine == it->second)
+          continue;
+        it->second = combine;
       }
+      LLVM_DEBUG({
+        DBGS() << "Setting value:\n\t" << value << "\nto encoding:\n\t"
+               << it->second << "\n";
+      });
+      worklist.insert(value);
     }
     return success();
   };
 
   // 1. Set seed values from set_auto_layout ops
   auto res = func.walk([&](gluon::SetAutoLayoutOp op) -> WalkResult {
-    return updateEncoding({op.getSrc()}, op.getType().getEncoding());
+    return updateEncoding({op.getSrc()},
+                          LayoutInfo{op.getType().getEncoding()});
   });
 
   if (res.wasInterrupted())
@@ -77,26 +143,28 @@ LogicalResult inferAutoLayouts(FuncOp func) {
   // 2. Propagate encodings through the graph until fixed point, or conflict
   while (!worklist.empty()) {
     auto val = worklist.pop_back_val();
-    auto enc = valueToEncoding[val];
-    assert(enc);
+    auto info = valueToEncoding[val];
+    assert(info);
 
     // Propagate to users
     for (OpOperand &use : val.getUses()) {
       auto op = use.getOwner();
       if (isa<scf::ForOp, scf::WhileOp>(op)) {
         auto offset = 3 * isa<scf::ForOp>(op);
         auto tiedArgs = getTiedArgs(op, use.getOperandNumber() - offset);
-        if (failed(updateEncoding(tiedArgs, enc)))
+        if (failed(updateEncoding(tiedArgs, info)))
           return failure();
       } else if (isa<scf::YieldOp>(op)) {
         auto tiedArgs = getTiedArgs(op, use.getOperandNumber());
-        if (failed(updateEncoding(tiedArgs, enc)))
+        if (failed(updateEncoding(tiedArgs, info)))
           return failure();
       } else {
-        auto dstEnc = inferDstEncoding(op, enc);
+        auto dstEnc = inferDstEncoding(op, info.encoding);
         if (dstEnc) {
+          bool mayVary = info.mayVary || encodingsMayVary(op);
+          LayoutInfo dstInfo{dstEnc, mayVary};
           if (failed(updateEncoding(llvm::to_vector_of<Value>(op->getResults()),
-                                    dstEnc)))
+                                    dstInfo)))
             return failure();
         }
       }
@@ -107,17 +175,19 @@ LogicalResult inferAutoLayouts(FuncOp func) {
       auto definingOp = opResult.getOwner();
       if (isa<scf::ForOp, scf::WhileOp, scf::IfOp>(definingOp)) {
         auto tiedArgs = getTiedArgs(definingOp, opResult.getResultNumber());
-        if (failed(updateEncoding(tiedArgs, enc)))
+        if (failed(updateEncoding(tiedArgs, info)))
           return failure();
       } else {
-        auto srcEncoding = inferSrcEncoding(definingOp, enc);
+        auto srcEncoding = inferSrcEncoding(definingOp, info.encoding);
         if (srcEncoding) {
+          bool mayVary = info.mayVary || encodingsMayVary(definingOp);
+          LayoutInfo srcInfo{srcEncoding, mayVary};
           llvm::SmallVector<Value> tensorOperands;
           for (auto operand : definingOp->getOperands())
             if (isa<RankedTensorType>(operand.getType()))
               tensorOperands.push_back(operand);
 
-          if (failed(updateEncoding(tensorOperands, srcEncoding)))
+          if (failed(updateEncoding(tensorOperands, srcInfo)))
             return failure();
         }
       }
@@ -126,18 +196,18 @@ LogicalResult inferAutoLayouts(FuncOp func) {
       if (isa<scf::ForOp, scf::WhileOp>(parentOp)) {
         auto offset = isa<scf::ForOp>(parentOp);
         auto tiedArgs = getTiedArgs(parentOp, blockArg.getArgNumber() - offset);
-        if (failed(updateEncoding(tiedArgs, enc)))
+        if (failed(updateEncoding(tiedArgs, info)))
          return failure();
       }
     }
   }
 
   // 3. Transfer propagated encodings into the graph
   auto ctx = func.getContext();
-  for (auto &[val, enc] : valueToEncoding) {
+  for (auto &[val, info] : valueToEncoding) {
     auto existingTy = cast<RankedTensorType>(val.getType());
     assert(isa<gluon::AutoEncodingAttr>(existingTy.getEncoding()));
-    auto ty = existingTy.cloneWithEncoding(enc);
+    auto ty = existingTy.cloneWithEncoding(info.encoding);
     val.setType(ty);
 
     if (auto opResult = dyn_cast<OpResult>(val)) {
```

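To make the resolution rules easy to check in isolation, here is a small self-contained C++ model of the `combineInfo` logic above. It is a sketch under simplifying assumptions: `std::string` stands in for `mlir::Attribute`, and the stable-hash tie-breaking (`hashWithMemo`/`compare`) that makes the real operation commutative is omitted.

```cpp
// Simplified model of the conflict-resolution rules: an encoding that "may
// vary" always loses to one that may not, equal encodings merge, and two
// distinct fixed encodings are a hard conflict.
#include <cassert>
#include <optional>
#include <string>

struct Info {
  std::string encoding; // stand-in for mlir::Attribute
  bool mayVary = false;
};

std::optional<Info> combine(Info lhs, Info rhs) {
  if (lhs.mayVary)
    return rhs;
  if (rhs.mayVary)
    return lhs;
  if (lhs.encoding == rhs.encoding)
    return lhs;
  return std::nullopt; // conflict: two fixed but different encodings
}

int main() {
  Info blocked{"#blocked", /*mayVary=*/false};
  Info linear{"#linear", /*mayVary=*/true}; // e.g. inferred through tt.reshape
  // The fixed blocked encoding wins over the inferred, may-vary linear one.
  assert(combine(linear, blocked)->encoding == "#blocked");
  // Two fixed encodings that disagree are reported as a conflict.
  assert(!combine(Info{"#blocked"}, Info{"#blocked1"}).has_value());
  return 0;
}
```
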
python/triton/language/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -84,7 +84,7 @@
     join,
     load,
     make_block_ptr,
-    map_elementwise,  # noqa
+    map_elementwise,
     max_constancy,
     max_contiguous,
     maximum,
@@ -209,6 +209,7 @@
     "log",
     "log2",
     "make_block_ptr",
+    "map_elementwise",
     "math",
     "max",
     "max_constancy",
```

test/Gluon/auto_encoding.mlir

Lines changed: 17 additions & 0 deletions
```diff
@@ -131,3 +131,20 @@ module attributes {ttg.maxnreg = 128 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-w
     tt.return %3 : tensor<128x128xi32, #blocked>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @_tmem_col_slice_load(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) -> tensor<64x128xi32, #blocked> {
+    // CHECK-DAG: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}>
+    // CHECK-DAG: [[LINEAR:#.*]] = #ttg.linear
+    // CHECK: [[RANGE:%.*]] = tt.make_range {end = 8192 : i32, start = 0 : i32} : tensor<8192xi32, [[LINEAR]]>
+    // CHECK: [[RESHAPE:%.*]] = tt.reshape [[RANGE]] : tensor<8192xi32, [[LINEAR]]> -> tensor<64x128xi32, [[BLOCKED]]>
+    // CHECK: tt.return [[RESHAPE]] : tensor<64x128xi32, [[BLOCKED]]>
+    %0 = tt.make_range {end = 8192 : i32, start = 0 : i32} : tensor<8192xi32, #gluon.auto_encoding>
+    %1 = tt.reshape %0 : tensor<8192xi32, #gluon.auto_encoding> -> tensor<64x128xi32, #gluon.auto_encoding>
+    %2 = gluon.set_auto_layout %1 : tensor<64x128xi32, #gluon.auto_encoding> -> tensor<64x128xi32, #blocked>
+    tt.return %2 : tensor<64x128xi32, #blocked>
+  }
+}
```

test/Gluon/invalid_auto_encoding.mlir

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@
 
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func public @infer_conflict() -> (tensor<16xi32, #blocked>, tensor<16xi32, #blocked1>) {
-    // expected-error @+1 {{Found conflicting encodings for value}}
+    // expected-error-re @+1 {{found conflicting encodings for value:{{.*}} #ttg.blocked<{sizePerThread = [1]{{.*}}and{{.*}} #ttg.blocked<{sizePerThread = [2]}}
     %0 = arith.constant dense<7> : tensor<16xi32, #gluon.auto_encoding>
     %cvt1 = gluon.set_auto_layout %0 : tensor<16xi32, #gluon.auto_encoding> -> tensor<16xi32, #blocked>
     %cvt2 = gluon.set_auto_layout %0 : tensor<16xi32, #gluon.auto_encoding> -> tensor<16xi32, #blocked1>
```
