Partially Revert "[LAYOUTS] Enable diagonal iteration unconditionally (#7218)" (#7245)

lezcano · web-flow · commit c8a711de562b · 2025-06-19T19:18:22.000Z
We are seeing some internal regressions. This partially reverts commit 336cc1d.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -152,10 +152,6 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   assert(layout.getNumInDims() == indices.size());
   assert(llvm::equal(layout.getInDimNames(), llvm::make_first_range(indices)));
-  // Trivial layout
-  if (layout.getNumOutDims() == 0) {
-    return {};
-  }
 
   // This function can emit a lot of MLIR code, which ultimately makes
   // compilation slow.  (We think this shouldn't be the case -- it's not *that*
@@ -169,65 +165,62 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
   SmallVector<std::pair<StringAttr, int32_t>> constantIns;
   SmallVector<std::pair<StringAttr, Value>> nonConstantIns;
   for (auto [inDimName, idx] : indices) {
-    APInt constant;
-    if (matchPattern(idx, m_ConstantInt(&constant))) {
-      constantIns.push_back({inDimName, constant.getSExtValue()});
+    if (auto constant = idx.getDefiningOp<LLVM::ConstantOp>()) {
+      constantIns.push_back(
+          {inDimName, cast<IntegerAttr>(constant.getValue()).getInt()});
     } else {
       constantIns.push_back({inDimName, 0});
       nonConstantIns.push_back({inDimName, idx});
     }
   }
+  SmallVector<int32_t> constantComponent =
+      llvm::to_vector(llvm::make_second_range(layout.apply(constantIns)));
 
-  // Compute constant part of the output and wrap it as values
   Value zero = b.i32_val(0);
   SmallVector<std::pair<StringAttr, Value>> outIndices;
-  for (auto [outDimName, constant] : layout.apply(constantIns)) {
-    if (constant == 0)
+  for (auto [i, outDimName] : llvm::enumerate(layout.getOutDimNames())) {
+    if (constantComponent[i] == 0)
       outIndices.push_back({outDimName, zero});
     else
-      outIndices.push_back({outDimName, b.i32_val(constant)});
-  }
-
-  if (nonConstantIns.size() == 0) {
-    return outIndices;
+      outIndices.push_back({outDimName, b.i32_val(constantComponent[i])});
   }
-
-  // Concatenate input
-  Value x = b.i32_val(0);
-  if (nonConstantIns.size() == 1) {
-    x = nonConstantIns[0].second;
-  } else {
+  // Happy path: Only one output.
+  if (outIndices.size() == 1) {
+    SmallVector<StringAttr> inDimNames;
+    // Concatenate input
+    Value x = b.i32_val(0);
     int shift = 0;
     for (auto [inDimName, idx] : nonConstantIns) {
+      inDimNames.push_back(inDimName);
       x = b.or_(x, b.shl(idx, b.i32_val(shift)));
       shift += layout.getInDimSizeLog2(inDimName);
     }
+    // Flatten ins
+    auto matrix = layout.sublayout(inDimNames, outIndices[0].first);
+    matrix = matrix.flattenIns();
+    auto out = triton::gpu::matrixVectorProd(b, matrix, x);
+    outIndices[0].second = b.xor_(outIndices[0].second, out);
+    return outIndices;
   }
 
-  // Remove constant input dims from the layout and flatten it
-  auto inDimNames = llvm::to_vector(llvm::make_first_range(nonConstantIns));
-  auto matrix = layout.sublayout(
-      inDimNames, llvm::to_vector(llvm::make_first_range(outIndices)));
-  auto flatMatrix = matrix.flattenIns().flattenOuts();
-
-  // Lower the matrix-vector product
-  auto out = triton::gpu::matrixVectorProd(b, flatMatrix, x);
+  for (auto [inDimName, idx] : indices) {
+    if (idx.getDefiningOp<LLVM::ConstantOp>()) {
+      continue;
+    }
 
-  // Unpack the output
-  if (matrix.getNumOutDims() == 1) {
-    outIndices[0].second = b.xor_(outIndices[0].second, out);
-  } else {
-    assert(llvm::equal(matrix.getOutDimNames(),
-                       llvm::make_first_range(outIndices)));
-    int shift = 0;
-    for (auto &[dimName, outIdx] : outIndices) {
-      auto outDimSizeLog2 = layout.getOutDimSizeLog2(dimName);
-      auto mask = (1 << outDimSizeLog2) - 1;
-      outIdx = b.xor_(outIdx,
-                      b.and_(b.lshr(out, b.i32_val(shift)), b.i32_val(mask)));
-      shift += outDimSizeLog2;
+    int nBits = layout.getInDimSizeLog2(inDimName);
+    for (int i = 0; i < nBits; i++) {
+      Value bit = b.and_(idx, b.i32_val(1 << i));
+      Value bit_is_zero = b.icmp_eq(bit, zero);
+      for (auto &[outDimName, outIdx] : outIndices) {
+        int32_t basis = layout.getBasis(inDimName, i, outDimName);
+        if (basis == 0)
+          continue;
+        outIdx = b.xor_(outIdx, b.select(bit_is_zero, zero, b.i32_val(basis)));
+      }
     }
   }
+
   return outIndices;
 }