[BACKEND] Simplify applyLinearLayout (#7417)

apgoucher · wdziurdz · commit 83caa5824db9 · 2025-08-12T14:25:55.000Z
This also improves the runtime of one of our kernels on Hopper.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -236,41 +236,21 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
     return outIndices;
   }
 
-  // Happy path: Only one output.
-  if (outIndices.size() == 1) {
-    SmallVector<StringAttr> inDimNames;
-    // Concatenate input
-    Value x = b.i32_val(0);
-    int shift = 0;
-    for (auto [inDimName, idx] : nonConstantIns) {
-      inDimNames.push_back(inDimName);
-      x = b.or_(x, b.shl(idx, b.i32_val(shift)));
-      shift += layout.getInDimSizeLog2(inDimName);
-    }
-    // Flatten ins
-    auto matrix = layout.sublayout(inDimNames, outIndices[0].first);
-    matrix = matrix.flattenIns();
+  SmallVector<StringAttr> inDimNames;
+  // Concatenate input
+  Value x = b.i32_val(0);
+  int shift = 0;
+  for (auto [inDimName, idx] : nonConstantIns) {
+    inDimNames.push_back(inDimName);
+    x = b.or_(x, b.shl(idx, b.i32_val(shift)));
+    shift += layout.getInDimSizeLog2(inDimName);
+  }
+
+  for (auto &[outDimName, outIdx] : outIndices) {
+    // Apply flattened sublayout for this output
+    auto matrix = layout.sublayout(inDimNames, outDimName).flattenIns();
     auto out = triton::gpu::matrixVectorProd(b, matrix, x);
-    outIndices[0].second = b.xor_(outIndices[0].second, out);
-    return outIndices;
-  }
-
-  for (auto [inDimName, idx] : indices) {
-    APInt constant;
-    if (matchPattern(idx, m_ConstantInt(&constant))) {
-      continue;
-    }
-    int nBits = layout.getInDimSizeLog2(inDimName);
-    for (int i = 0; i < nBits; i++) {
-      Value bit = b.and_(idx, b.i32_val(1 << i));
-      Value bit_is_zero = b.icmp_eq(bit, zero);
-      for (auto &[outDimName, outIdx] : outIndices) {
-        int32_t basis = layout.getBasis(inDimName, i, outDimName);
-        if (basis == 0)
-          continue;
-        outIdx = b.xor_(outIdx, b.select(bit_is_zero, zero, b.i32_val(basis)));
-      }
-    }
+    outIdx = b.xor_(outIdx, out);
   }
 
   return outIndices;