[LLs] Tree-reduce the xor reduction in LLs codegen (#7816)

lezcano · web-flow · commit 4da227ce9cef · 2025-08-11T18:56:03.000+01:00
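Reduce the per-diagonal and per-column terms pairwise as a balanced tree
instead of a serial xor chain. Terms whose rows are all set by exactly one
column are combined with `or disjoint`; the remaining terms are still xored.
This should allow better scheduling at the expense of slightly higher
register pressure.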
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -158,39 +158,54 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
   SmallVector<int32_t> matrix = flatten(A.getBases().begin()->second);
   assert(matrix.size() == nCol);
 
-  // We iterate the matrix following the diagonals
-  // The idea here is that we want to generate code of the form:
-  // \xor_i (x & mask_i) << s_i
-  // where s_i may by positive or negative (left or right shift)
-  // The hope here (and we see it in codegen) is that LLVM can turn
-  // the xor into a sum and then the sum + LHS/RHS can be fused into a mad.lo
-  // Get the i-th diagonal
-  auto getMask = [&](int i) {
+  // Per-row popcount: mark rows whose bit is set in exactly one column.
+  uint32_t rowsUnique = 0;
+  {
+    SmallVector<int> rowPopCnt(nRow, 0);
+    for (int c = 0; c < nCol; ++c) {
+      uint32_t colBits = matrix[c];
+      for (int r = 0; r < nRow; ++r) {
+        if (colBits & (1u << r))
+          ++rowPopCnt[r];
+      }
+    }
+    for (int r = 0; r < nRow; ++r) {
+      if (rowPopCnt[r] == 1)
+        rowsUnique |= 1u << r;
+    }
+  }
+
+  // Walk the matrix diagonal by diagonal, building (x & mask_i) << s_i terms
+  // (s_i < 0 means a right shift). Terms whose rows are all unique are ORed;
+  // the rest are XORed. This tends to encourage mad.lo codegen.
+  auto getMaskAndAllRowsUnique = [&](int i) -> std::pair<uint32_t, bool> {
     uint32_t mask = 0;
     int row = i < 0 ? -i : 0;
     int col = i < 0 ? 0 : i;
+    bool allRowsUnique = true;
     while (row < nRow && col < nCol) {
       uint32_t bitValue = (matrix[col] >> row) & 1u;
       mask |= bitValue << col;
+      allRowsUnique &= ((rowsUnique >> row) & 1u) == 1u;
       ++row;
       ++col;
     }
-    return mask;
+    return {mask, allRowsUnique};
   };
 
   uint32_t explicitCols = 0;
 
   {
     SmallVector<uint32_t> masks;
     for (int i = -nRow + 1; i < nCol; i++) {
-      masks.push_back(getMask(i));
+      masks.push_back(std::get<0>(getMaskAndAllRowsUnique(i)));
     }
     bool reachedFixedPoint = false;
     while (!reachedFixedPoint) {
       reachedFixedPoint = true;
       for (uint32_t m : masks) {
         uint32_t c = m & ~explicitCols;
-        if ((c != 0) && ((c & (c - 1)) == 0)) {
+        if (llvm::isPowerOf2_32(c)) {
           // found a single-element diagonal
           explicitCols |= c;
           reachedFixedPoint = false;
@@ -200,14 +215,21 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
   }
 
   // handle any diagonals that have survived
-  Value ret = b.i32_val(0);
+  SmallVector<Value> ors;
+  SmallVector<Value> xors;
   for (int i = -nRow + 1; i < nCol; i++) {
-    auto mask = getMask(i) & ~explicitCols;
+    auto [mask, allRowsUnique] = getMaskAndAllRowsUnique(i);
+    mask &= ~explicitCols;
     if (mask == 0)
       continue;
     auto masked = b.and_(x, b.i32_val(mask));
-    ret = b.xor_(ret, i >= 0 ? Value(b.lshr(masked, b.i32_val(i)))
-                             : Value(b.shl(masked, b.i32_val(-i))));
+    auto shifted = i >= 0 ? Value(b.lshr(masked, b.i32_val(i)))
+                          : Value(b.shl(masked, b.i32_val(-i)));
+    if (allRowsUnique) {
+      ors.push_back(shifted);
+    } else {
+      xors.push_back(shifted);
+    }
   }
 
   // handle any explicit columns:
@@ -219,10 +241,35 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
       int32_t basis = matrix[i];
       if (basis == 0)
         continue;
-      ret = b.xor_(ret, b.select(bit_is_zero, zero, b.i32_val(basis)));
+      auto select = b.select(bit_is_zero, zero, b.i32_val(basis));
+      if ((rowsUnique & basis) == basis) {
+        ors.push_back(select);
+      } else {
+        xors.push_back(select);
+      }
     }
   }
-  return ret;
+
+  auto treeReduce = [&](SmallVector<Value> &terms,
+                        std::function<Value(Value, Value)> op) -> Value {
+    if (terms.empty())
+      return b.i32_val(0);
+    while (terms.size() > 1) {
+      SmallVector<Value> next;
+      for (size_t i = 0; i + 1 < terms.size(); i += 2)
+        next.push_back(op(terms[i], terms[i + 1]));
+      if (terms.size() % 2 == 1)
+        next.push_back(terms.back());
+      terms = std::move(next);
+    }
+    return terms[0];
+  };
+
+  auto orPart = treeReduce(
+      ors, [&b](Value x, Value y) { return b.or_(x, y, /*disjoint=*/true); });
+  auto xorPart =
+      treeReduce(xors, [&b](Value x, Value y) { return b.xor_(x, y); });
+  return b.or_(orPart, xorPart, /*disjoint=*/true);
 }
 
 } // namespace triton::gpu
diff --git a/test/Conversion/amd/convert_layout.mlir b/test/Conversion/amd/convert_layout.mlir
@@ -12,7 +12,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 
     // Part of offset computation generated by applyLinearLayout function
     // CHECK: [[SEL:%.*]]= llvm.select {{.*}}, {{.*}}, [[CST_128]]
-    // CHECK: [[OFFSET_0:%.*]] = llvm.xor {{.*}}, [[SEL]]
+    // CHECK-COUNT-3: llvm.or disjoint
+    // CHECK-COUNT-2: llvm.xor
+    // CHECK: [[OFFSET_0:%.*]] = llvm.or disjoint
     // CHECK: [[OFFSET_1:%.*]] = llvm.xor {{.*}}, [[OFFSET_0]] : i32
 
     // Part of offset computation generated by lowerLdSt function after applyLinearLayout