Revert "[LLs] Tree-reduce the xor reduction in LLs codegen (#7816)"

whitneywhtsang · whitneywhtsang · commit 99796f42aa3e · 2025-08-23T00:11:45.000Z
This reverts commit 4da227c.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -158,54 +158,39 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
   SmallVector<int32_t> matrix = flatten(A.getBases().begin()->second);
   assert(matrix.size() == nCol);
 
-  // Row-wise popcount to detect rows that appear exactly once across columns.
-  uint32_t rowsUnique = 0;
-  {
-    SmallVector<int> rowPopCnt(nRow, 0);
-    for (int c = 0; c < nCol; ++c) {
-      uint32_t colBits = matrix[c];
-      for (int r = 0; r < nRow; ++r) {
-        if (colBits & (1u << r))
-          ++rowPopCnt[r];
-      }
-    }
-    for (int r = 0; r < nRow; ++r) {
-      if (rowPopCnt[r] == 1)
-        rowsUnique |= 1u << r;
-    }
-  }
-
-  // We iterate the matrix following the diagonals and build
-  // (x & mask_i) << s_i terms. Prefer OR for diagonals whose rows are unique,
-  // then XOR everything else. This tends to encourage mad.lo codegen.
-  auto getMaskAndAllRowsUnique = [&](int i) -> std::pair<uint32_t, bool> {
+  // We iterate the matrix following the diagonals
+  // The idea here is that we want to generate code of the form:
+  // \xor_i (x & mask_i) << s_i
+  // where s_i may be positive or negative (left or right shift)
+  // The hope here (and we see it in codegen) is that LLVM can turn
+  // the xor into a sum and then the sum + LHS/RHS can be fused into a mad.lo
+  // Get the i-th diagonal
+  auto getMask = [&](int i) {
     uint32_t mask = 0;
     int row = i < 0 ? -i : 0;
     int col = i < 0 ? 0 : i;
-    bool allRowsUnique = true;
     while (row < nRow && col < nCol) {
       uint32_t bitValue = (matrix[col] >> row) & 1u;
       mask |= bitValue << col;
-      allRowsUnique &= ((rowsUnique >> row) & 1u) == 1u;
       ++row;
       ++col;
     }
-    return {mask, allRowsUnique};
+    return mask;
   };
 
   uint32_t explicitCols = 0;
 
   {
     SmallVector<uint32_t> masks;
     for (int i = -nRow + 1; i < nCol; i++) {
-      masks.push_back(std::get<0>(getMaskAndAllRowsUnique(i)));
+      masks.push_back(getMask(i));
     }
     bool reachedFixedPoint = false;
     while (!reachedFixedPoint) {
       reachedFixedPoint = true;
       for (uint32_t m : masks) {
         uint32_t c = m & ~explicitCols;
-        if (llvm::isPowerOf2_32(c)) {
+        if ((c != 0) && ((c & (c - 1)) == 0)) {
           // found a single-element diagonal
           explicitCols |= c;
           reachedFixedPoint = false;
@@ -215,21 +200,14 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
   }
 
   // handle any diagonals that have survived
-  SmallVector<Value> ors;
-  SmallVector<Value> xors;
+  Value ret = b.i32_val(0);
   for (int i = -nRow + 1; i < nCol; i++) {
-    auto [mask, allRowsUnique] = getMaskAndAllRowsUnique(i);
-    mask &= ~explicitCols;
+    auto mask = getMask(i) & ~explicitCols;
     if (mask == 0)
       continue;
     auto masked = b.and_(x, b.i32_val(mask));
-    auto shifted = i >= 0 ? Value(b.lshr(masked, b.i32_val(i)))
-                          : Value(b.shl(masked, b.i32_val(-i)));
-    if (allRowsUnique) {
-      ors.push_back(shifted);
-    } else {
-      xors.push_back(shifted);
-    }
+    ret = b.xor_(ret, i >= 0 ? Value(b.lshr(masked, b.i32_val(i)))
+                             : Value(b.shl(masked, b.i32_val(-i))));
   }
 
   // handle any explicit columns:
@@ -241,35 +219,10 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
       int32_t basis = matrix[i];
       if (basis == 0)
         continue;
-      auto select = b.select(bit_is_zero, zero, b.i32_val(basis));
-      if ((rowsUnique & basis) == basis) {
-        ors.push_back(select);
-      } else {
-        xors.push_back(select);
-      }
+      ret = b.xor_(ret, b.select(bit_is_zero, zero, b.i32_val(basis)));
     }
   }
-
-  auto treeReduce = [&](SmallVector<Value> &terms,
-                        std::function<Value(Value, Value)> op) -> Value {
-    if (terms.empty())
-      return b.i32_val(0);
-    while (terms.size() > 1) {
-      SmallVector<Value> next;
-      for (size_t i = 0; i + 1 < terms.size(); i += 2)
-        next.push_back(op(terms[i], terms[i + 1]));
-      if (terms.size() % 2 == 1)
-        next.push_back(terms.back());
-      terms = std::move(next);
-    }
-    return terms[0];
-  };
-
-  auto orPart = treeReduce(
-      ors, [&b](Value x, Value y) { return b.or_(x, y, /*disjoint=*/true); });
-  auto xorPart =
-      treeReduce(xors, [&b](Value x, Value y) { return b.xor_(x, y); });
-  return b.or_(orPart, xorPart, /*disjoint=*/true);
+  return ret;
 }
 
 } // namespace triton::gpu
diff --git a/test/Conversion/amd/convert_layout.mlir b/test/Conversion/amd/convert_layout.mlir
@@ -12,9 +12,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 
     // Part of offset computation generated by applyLinearLayout function
     // CHECK: [[SEL:%.*]]= llvm.select {{.*}}, {{.*}}, [[CST_128]]
-    // CHECK-COUNT-3: llvm.or disjoint
-    // CHECK-COUNT-2: llvm.xor
-    // CHECK: [[OFFSET_0:%.*]] = llvm.or disjoint
+    // CHECK: [[OFFSET_0:%.*]] = llvm.xor {{.*}}, [[SEL]]
     // CHECK: [[OFFSET_1:%.*]] = llvm.xor {{.*}}, [[OFFSET_0]] : i32
 
     // Part of offset computation generated by lowerLdSt function after applyLinearLayout