Commit c172d53
[BACKEND] Generalise maybeDeduplicate to all layouts (#8492)
We had a subtle asymmetry here that was producing different PTX for the same layout. We now generalise this pass to work with any layout and drop a few of the restrictions the previous implementation had.
1 parent 8f5aa60

2 files changed: +30, -93 lines

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 28 additions & 86 deletions
@@ -57,6 +57,7 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
   // computation is eliminated.
   SmallVector<Value> maybeDeduplicate(SourceOp op,
                                       SmallVector<Value> resultVals) const {
+    auto ctx = op.getContext();
     if (!isMemoryEffectFree(op))
       // the op has side effects: can't dedup
       return resultVals;
@@ -65,104 +66,45 @@
       // there must be exactly 1 result
       return resultVals;
     Value result = results[0];
-    Type type = result.getType();
-    if (!type)
-      return resultVals;
-    RankedTensorType rtType = dyn_cast<RankedTensorType>(type);
+    RankedTensorType rtType = dyn_cast<RankedTensorType>(result.getType());
     if (!rtType)
       // the result must be a tensor
       return resultVals;
-    Attribute encoding = rtType.getEncoding();
-    if (!encoding)
-      // encoding not available
-      return resultVals;
-    Attribute baseEncoding = encoding;
-    if (isa<AMDMfmaEncodingAttr>(baseEncoding) ||
-        isa<AMDWmmaEncodingAttr>(baseEncoding))
-      // TODO: this logic seems incorrect for mfma and wmma layout. Skip for
-      // now. We saw mismatches for some flash-attention and dot tests on AMD
-      // backend. Note that this logic works for sliced layout whose parent is
-      // mfma layout. Therefore, this is not combined with the following check.
-      return resultVals;
-    while (auto sliced = dyn_cast<SliceEncodingAttr>(baseEncoding))
-      baseEncoding = sliced.getParent();
-    if (isa<LinearEncodingAttr, DotOperandEncodingAttr>(baseEncoding)) {
-      // TODO: this logic seems incorrect for mma layout. Skip for now.
-      // The following test crashes and some other miscompile:
-      // test_core::test_fp8_dot_acc
-      return resultVals;
-    }
 
-    SmallVector<unsigned> elemsPerThread = getElemsPerThread(rtType);
-    int rank = elemsPerThread.size();
-    if (product<unsigned>(elemsPerThread) != resultVals.size())
-      return resultVals;
+    // Bail out if we don't have the constancy analysis
     AxisInfo *axisInfo = axisAnalysisPass.getAxisInfo(result);
     if (!axisInfo)
-      // axis info (e.g., constancy) not available
-      return resultVals;
-    SmallVector<unsigned> contigPerThread = getContigPerThread(rtType);
-    if (rank != contigPerThread.size())
       return resultVals;
-
     SmallVector<int64_t> constancy = axisInfo->getConstancy();
-    if (rank != constancy.size())
-      return resultVals;
-    bool hasConstancy = false;
-    for (int i = 0; i < rank; ++i) {
-      if (constancy[i] > contigPerThread[i]) {
-        if (constancy[i] % contigPerThread[i] != 0)
-          // constancy is not evenly covered by contigPerThread
-          return resultVals;
-        // can't move the values across different
-        // "contigPerThread"-sized blocks
-        constancy[i] = contigPerThread[i];
-      }
-      if (elemsPerThread[i] < 1 || constancy[i] < 1)
-        return resultVals;
-      if (!(elemsPerThread[i] % constancy[i] == 0 ||
-            constancy[i] % elemsPerThread[i] == 0))
-        // either the constancy along each dimension must fit
-        // into the elemsPerThread or the other way around
-        return resultVals;
-      if (constancy[i] > 1)
-        hasConstancy = true;
-    }
-    if (!hasConstancy)
-      // nothing to deduplicate
-      return resultVals;
 
-    if (rank > 1) {
-      // reorder the shape and constancy vectors by the axis order:
-      // from the fastest-changing to the smallest-changing axis
-      SmallVector<unsigned> order = getOrder(rtType);
-      if (rank != order.size())
-        return resultVals;
-      elemsPerThread = applyPermutation(elemsPerThread, order);
-      constancy = applyPermutation(constancy, order);
-    }
+    if (llvm::all_of(constancy, [](int64_t c) { return c == 1; }))
+      return resultVals;
 
-    SmallVector<unsigned> strides(rank, 1);
-    for (int i = 1; i < rank; ++i) {
-      strides[i] = strides[i - 1] * elemsPerThread[i - 1];
-    }
-    SmallVector<Value> dedupResultVals;
-    dedupResultVals.reserve(resultVals.size());
-    for (int i = 0; i < resultVals.size(); ++i) {
-      // each coordinate of the orig_idx is "coarsened" using the
-      // constancy along this dimension: the resulting dedup_idx
-      // points to the reused value in the original resultsVal
-      int orig_idx = i;
-      int dedup_idx = 0;
-      for (int j = 0; j < rank; ++j) {
-        int coord_j = orig_idx % elemsPerThread[j];
-        dedup_idx += (coord_j / constancy[j] * constancy[j]) * strides[j];
-        orig_idx /= elemsPerThread[j];
+    // We zero out the bases that are constant
+    auto kReg = StringAttr::get(ctx, "register");
+    auto ll = toLinearLayout(rtType);
+    auto dims = to_vector(ll.getOutDimNames());
+    auto llReg = ll.sublayout({kReg}, dims);
+    auto inv = ll.pseudoinvert();
+    auto invReg = inv.sublayout(dims, {kReg});
+    auto bases_inv = invReg.getBases();
+    for (auto [c, d] : llvm::zip(constancy, dims)) {
+      assert(llvm::isPowerOf2_32(c));
+      for (int i = 0; i < llvm::Log2_32(c); i++) {
+        bases_inv[d][i] = {0};
       }
-      dedupResultVals.push_back(resultVals[dedup_idx]);
     }
-
-    return dedupResultVals;
+    auto invBroadcast =
+        LinearLayout(bases_inv, invReg.getOutDims(), /*isSurjective=*/false);
+    auto cvt = llReg.compose(invBroadcast);
+
+    // Deduplicate the result values
+    SmallVector<Value> outVals(resultVals.size());
+    for (int i = 0; i < outVals.size(); i++) {
+      auto srcIdx = cvt.apply({{kReg, i}}).begin()->second;
+      outVals[i] = resultVals[srcIdx];
+    }
+    return outVals;
   }
   LogicalResult
   matchAndRewrite(SourceOp op, OpAdaptor adaptor,
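The heart of the change is the last hunk: instead of coarsening per-dimension indices by hand, the new code composes the register-to-coordinate layout with a pseudoinverse whose constant bases have been zeroed, yielding a register-to-register map that sends each register to the canonical register already holding the same value. Below is a minimal self-contained sketch of that idea in plain C++, using toy XOR-basis layouts rather than Triton's actual LinearLayout class; the apply helper and the basis vectors are illustrative assumptions, not the real API.

// Toy model of the new dedup path (assumed helper, not Triton's API).
// A layout over GF(2) is a list of basis vectors; applying it XORs
// together the bases selected by the set bits of the input index.
#include <cstdint>
#include <iostream>
#include <vector>

uint32_t apply(const std::vector<uint32_t> &bases, uint32_t in) {
  uint32_t out = 0;
  for (size_t i = 0; i < bases.size(); ++i)
    if (in & (1u << i))
      out ^= bases[i];
  return out;
}

int main() {
  // 8 registers map to 8 elements of a 1-D tensor; both the layout and
  // its pseudoinverse are the identity here (basis i is 1 << i).
  std::vector<uint32_t> llReg = {1, 2, 4};  // register -> element
  std::vector<uint32_t> invReg = {1, 2, 4}; // element -> register

  // Constancy 4: the low log2(4) = 2 element bits don't affect the value,
  // so zero the corresponding inverse bases (cf. bases_inv[d][i] = {0}).
  for (int i = 0; i < 2; ++i)
    invReg[i] = 0;

  // cvt = llReg composed with the broadcast inverse sends each register
  // to the canonical register of its constancy block.
  for (uint32_t r = 0; r < 8; ++r)
    std::cout << r << " -> " << apply(invReg, apply(llReg, r)) << "\n";
  // Prints 0..3 -> 0 and 4..7 -> 4: one live value per block of four.
}

Because the map is linear over GF(2), zeroing a basis collapses an entire power-of-two block onto its representative, which is presumably why the new code asserts that each constancy value is a power of two.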

test/Conversion/amd/dedup-by-constancy.mlir

Lines changed: 2 additions & 7 deletions
@@ -1,18 +1,13 @@
 // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm=arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s
 
 // CHECK-LABEL: dedup_by_constancy_mfma
-// CHECK-COUNT-4: llvm.icmp "slt"
+// CHECK-COUNT-2: llvm.icmp "slt"
 // CHECK-NOT: llvm.icmp "slt"
-// Here is why we expect exactly 4 icmp:
 // For a 32x32 tensor A with mfma layout, each thread holds 16 elements, which are divided
 // into 4 groups. E.g. thread 0 holds elements A[0:3,0], A[8:11,0], A[16:19,0], and A[24:27,0].
 // In this example, constancy of the tensor is 16 for dim 0, meaning A[0:15,0] have same values
 // and A[16:31,0] have same values. Therefore, for thread 0, the first 8 elements are duplicated
-// and the last 8 elements are duplicated. Ideally, thread 0 only needs two icmp, one for the
-// first 8 elements and the other for the last 8 elements. In practice, the dedup analysis
-// only allows duplication within each group of 4 elemnets. Therefore, we expect 4 icmp, one
-// for each group of 4 elements.
-// In the future, we can reduce the icmp to 2 in such case.
+// and the last 8 elements are duplicated.
 #mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [1, 1], instrShape = [32, 32, 8], isTransposed = false}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   tt.func public @dedup_by_constancy_mfma(%arg0: i32 {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
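As a sanity check of the new expected count, the same toy model from the sketch above reproduces the two surviving icmp for this mfma case: thread 0's registers 0-15 map to rows 0-3, 8-11, 16-19, 24-27 of dim 0, and constancy 16 zeroes the low four row bases of the pseudoinverse. The concrete basis vectors below are worked out by hand from the comment in the test, not queried from Triton.

// Hand-worked check of the mfma example (assumed bases, not the real API).
#include <cstdint>
#include <iostream>
#include <vector>

// XOR together the bases selected by the set bits of `in` (toy GF(2) layout).
uint32_t apply(const std::vector<uint32_t> &bases, uint32_t in) {
  uint32_t out = 0;
  for (size_t i = 0; i < bases.size(); ++i)
    if (in & (1u << i))
      out ^= bases[i];
  return out;
}

int main() {
  // register -> row for thread 0: row = (r & 3) + 8 * bit2(r) + 16 * bit3(r)
  std::vector<uint32_t> llReg = {1, 2, 8, 16};
  // Pseudoinverse row -> register: row bits 0,1,3,4 map back to register
  // bits 0,1,2,3; row bit 2 is never produced, so it inverts to 0.
  std::vector<uint32_t> invReg = {1, 2, 0, 4, 8};
  // Constancy 16 along dim 0: zero the low log2(16) = 4 row bases.
  for (int i = 0; i < 4; ++i)
    invReg[i] = 0;

  // Count the registers that survive deduplication (those mapped to
  // themselves): registers 0-7 collapse onto 0 and 8-15 onto 8.
  int unique = 0;
  for (uint32_t r = 0; r < 16; ++r)
    if (apply(invReg, apply(llReg, r)) == r)
      ++unique;
  std::cout << unique << " unique registers\n"; // prints "2 unique registers"
}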
