Skip to content

Commit fe79064

Browse files
Merge commit '336cc1d530fe9df8db610e880330b9fa4de82925'
2 parents dc3c13d + 336cc1d commit fe79064

File tree

5 files changed: +64 additions, −52 deletions

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,8 +309,6 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
309309
auto totalStoreCvt = srcLayout.invertAndCompose(smem);
310310
auto totalLoadCvt = dstLayout.invertAndCompose(smem);
311311

312-
// FIXME(Lezcano): The legacy path also creates PRMT, so we should revisit
313-
314312
// The permutation exists by construction of the reps dimension in
315313
// optimalSwizzling
316314
auto permStore =

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,10 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
151151
auto b = TritonLLVMOpBuilder(loc, rewriter);
152152
assert(layout.getNumInDims() == indices.size());
153153
assert(llvm::equal(layout.getInDimNames(), llvm::make_first_range(indices)));
154+
// Trivial layout
155+
if (layout.getNumOutDims() == 0) {
156+
return {};
157+
}
154158

155159
// This function can emit a lot of MLIR code, which ultimately makes
156160
// compilation slow. (We think this shouldn't be the case -- it's not *that*
@@ -164,62 +168,65 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
164168
SmallVector<std::pair<StringAttr, int32_t>> constantIns;
165169
SmallVector<std::pair<StringAttr, Value>> nonConstantIns;
166170
for (auto [inDimName, idx] : indices) {
167-
if (auto constant = idx.getDefiningOp<LLVM::ConstantOp>()) {
168-
constantIns.push_back(
169-
{inDimName, cast<IntegerAttr>(constant.getValue()).getInt()});
171+
APInt constant;
172+
if (matchPattern(idx, m_ConstantInt(&constant))) {
173+
constantIns.push_back({inDimName, constant.getSExtValue()});
170174
} else {
171175
constantIns.push_back({inDimName, 0});
172176
nonConstantIns.push_back({inDimName, idx});
173177
}
174178
}
175-
SmallVector<int32_t> constantComponent =
176-
llvm::to_vector(llvm::make_second_range(layout.apply(constantIns)));
177179

180+
// Compute constant part of the output and wrap it as values
178181
Value zero = b.i32_val(0);
179182
SmallVector<std::pair<StringAttr, Value>> outIndices;
180-
for (auto [i, outDimName] : llvm::enumerate(layout.getOutDimNames())) {
181-
if (constantComponent[i] == 0)
183+
for (auto [outDimName, constant] : layout.apply(constantIns)) {
184+
if (constant == 0)
182185
outIndices.push_back({outDimName, zero});
183186
else
184-
outIndices.push_back({outDimName, b.i32_val(constantComponent[i])});
187+
outIndices.push_back({outDimName, b.i32_val(constant)});
188+
}
189+
190+
if (nonConstantIns.size() == 0) {
191+
return outIndices;
185192
}
186-
// Happy path: Only one output.
187-
if (outIndices.size() == 1) {
188-
SmallVector<StringAttr> inDimNames;
189-
// Concatenate input
190-
Value x = b.i32_val(0);
193+
194+
// Concatenate input
195+
Value x = b.i32_val(0);
196+
if (nonConstantIns.size() == 1) {
197+
x = nonConstantIns[0].second;
198+
} else {
191199
int shift = 0;
192200
for (auto [inDimName, idx] : nonConstantIns) {
193-
inDimNames.push_back(inDimName);
194201
x = b.or_(x, b.shl(idx, b.i32_val(shift)));
195202
shift += layout.getInDimSizeLog2(inDimName);
196203
}
197-
// Flatten ins
198-
auto matrix = layout.sublayout(inDimNames, outIndices[0].first);
199-
matrix = matrix.flattenIns();
200-
auto out = triton::gpu::matrixVectorProd(b, matrix, x);
201-
outIndices[0].second = b.xor_(outIndices[0].second, out);
202-
return outIndices;
203204
}
204205

205-
for (auto [inDimName, idx] : indices) {
206-
if (idx.getDefiningOp<LLVM::ConstantOp>()) {
207-
continue;
208-
}
206+
// Remove constant input dims from the layout and flatten it
207+
auto inDimNames = llvm::to_vector(llvm::make_first_range(nonConstantIns));
208+
auto matrix = layout.sublayout(
209+
inDimNames, llvm::to_vector(llvm::make_first_range(outIndices)));
210+
auto flatMatrix = matrix.flattenIns().flattenOuts();
211+
212+
// Lower the matrix-vector product
213+
auto out = triton::gpu::matrixVectorProd(b, flatMatrix, x);
209214

210-
int nBits = layout.getInDimSizeLog2(inDimName);
211-
for (int i = 0; i < nBits; i++) {
212-
Value bit = b.and_(idx, b.i32_val(1 << i));
213-
Value bit_is_zero = b.icmp_eq(bit, zero);
214-
for (auto &[outDimName, outIdx] : outIndices) {
215-
int32_t basis = layout.getBasis(inDimName, i, outDimName);
216-
if (basis == 0)
217-
continue;
218-
outIdx = b.xor_(outIdx, b.select(bit_is_zero, zero, b.i32_val(basis)));
219-
}
215+
// Unpack the output
216+
if (matrix.getNumOutDims() == 1) {
217+
outIndices[0].second = b.xor_(outIndices[0].second, out);
218+
} else {
219+
assert(llvm::equal(matrix.getOutDimNames(),
220+
llvm::make_first_range(outIndices)));
221+
int shift = 0;
222+
for (auto &[dimName, outIdx] : outIndices) {
223+
auto outDimSizeLog2 = layout.getOutDimSizeLog2(dimName);
224+
auto mask = (1 << outDimSizeLog2) - 1;
225+
outIdx = b.xor_(outIdx,
226+
b.and_(b.lshr(out, b.i32_val(shift)), b.i32_val(mask)));
227+
shift += outDimSizeLog2;
220228
}
221229
}
222-
223230
return outIndices;
224231
}
225232

lib/Tools/GenericSwizzling.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,9 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
311311
// Bits in a bank segment: 32 banks x 32 bits
312312
constexpr int32_t bankBits = 32 * 32;
313313
// Bases needed to cover a whole bank segment
314-
const int32_t lenBbasis =
315-
llvm::Log2_32(bankBits / ((1 << vbasis.size()) * bitwidth));
314+
const int32_t lenBbasis = std::min<int32_t>(
315+
llvm::Log2_32(bankBits / ((1 << vbasis.size()) * bitwidth)),
316+
dim - vbasis.size());
316317
// Bases to cover all the tensor
317318
const int32_t lenSbasis = dim - lenBbasis - vbasis.size();
318319

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,17 @@ def _p_matmul_ogs(
384384
block_shape=[BLOCK_M, OUT_BLOCK_N],
385385
)
386386

387+
# bias + scale
388+
offs_y_n = off_n1 + tl.arange(0, BLOCK_N)
389+
mask_n = offs_y_n < N
390+
if B is not None:
391+
BPtrs = B + expt_id1 * stride_b_e + offs_y_n
392+
if pid_k1 == 0:
393+
bias = tl.load(BPtrs, mask=mask_n, other=0)
394+
else:
395+
bias = tl.full([BLOCK_N], 0, dtype=tl.float32)
396+
else:
397+
bias = tl.full([BLOCK_N], 0, dtype=tl.float32)
387398
if Betas is not None:
388399
betas = tl.load(Betas + start_m1 + offs_m, mask=mask_m, other=0.0)
389400
else:
@@ -399,15 +410,21 @@ def _p_matmul_ogs(
399410
w_scale = load_scale(WScale)
400411

401412
accs = (acc,)
413+
biases = (bias,)
402414

403415
if SUBTILE_FACTOR >= 2:
404416
acc0, acc1 = acc.reshape(BLOCK_M, 2, BLOCK_N // 2).permute(0, 2, 1).split()
405417
accs = (acc0, acc1)
418+
bias0, bias1 = bias.reshape(2, BLOCK_N // 2).permute(1, 0).split()
419+
biases = (bias0, bias1)
406420

407421
if SUBTILE_FACTOR >= 4:
408422
acc00, acc01 = acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
409423
acc10, acc11 = acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
410424
accs = (acc00, acc01, acc10, acc11)
425+
bias00, bias01 = bias0.reshape(2, BLOCK_N // 4).permute(1, 0).split()
426+
bias10, bias11 = bias1.reshape(2, BLOCK_N // 4).permute(1, 0).split()
427+
biases = (bias00, bias01, bias10, bias11)
411428

412429
tl.static_assert(EPILOGUE_BLOCK_N == BLOCK_N // SUBTILE_FACTOR)
413430
tl.static_assert(len(accs) == SUBTILE_FACTOR)
@@ -419,18 +436,7 @@ def _p_matmul_ogs(
419436
if SWAP_XW:
420437
acc_tile = acc_tile.T
421438

422-
if B is not None:
423-
offs_y_n = off_n1 + EPILOGUE_BLOCK_N * a_i + tl.arange(0, EPILOGUE_BLOCK_N)
424-
mask_n = offs_y_n < N
425-
BPtrs = B + expt_id1 * stride_b_e + offs_y_n
426-
if pid_k1 == 0:
427-
bias = tl.load(BPtrs, mask=mask_n, other=0)
428-
else:
429-
bias = tl.full([EPILOGUE_BLOCK_N], 0, dtype=tl.float32)
430-
else:
431-
bias = tl.full([EPILOGUE_BLOCK_N], 0, dtype=tl.float32)
432-
433-
acc_tile = acc_tile + bias[None, :] * betas[:, None]
439+
acc_tile = acc_tile + biases[a_i][None, :] * betas[:, None]
434440
if out_alpha is not None:
435441
acc_tile *= out_alpha
436442

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1293,7 +1293,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
12931293
// CHECK-LABEL: linear_layout_with_multiple_iterations
12941294
tt.func @linear_layout_with_multiple_iterations(%src: tensor<8x4xbf16, #linear>) {
12951295
%cvt = ttg.convert_layout %src : tensor<8x4xbf16, #linear> -> tensor<8x4xbf16, #linear1>
1296-
// CHECK-COUNT-2: llvm.store {{.*}} : vector<2xi16>
1296+
// CHECK-COUNT-1: llvm.store {{.*}} : vector<4xi16>
12971297
// CHECK: nvvm.barrier0
12981298
// CHECK-COUNT: llvm.load{{.*}}->vector<2xi16>
12991299
tt.return

0 commit comments

Comments (0)