Commit cdd7a3d

[AMD] Support register broadcast in slice/concat ops (#7407)
This PR:
- permits arbitrary broadcasted registers in layouts
- fixes a few possible crashes in the verifier in the case of broadcasted layouts

Co-authored-by: Alexander Efimov <[email protected]>
1 parent b80f5dd commit cdd7a3d
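
For background, a minimal sketch of what a broadcasted register means in a linear layout, assuming the usual semantics that the bases selected by the set bits of a hardware index are XOR-ed together (the helper below is illustrative, not Triton's LinearLayout API):

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// A linear layout maps each bit of a hardware index (here, the register id)
// to a basis vector and XORs the selected bases to get an element coordinate.
// A zero basis such as [0, 0] contributes nothing, so register ids differing
// only in that bit alias the same element: the register is broadcasted.
using Coord = std::array<int32_t, 2>;

Coord applyRegisterBases(const std::vector<Coord> &bases, unsigned regId) {
  Coord out = {0, 0};
  for (size_t bit = 0; bit < bases.size(); ++bit)
    if (regId & (1u << bit)) {
      out[0] ^= bases[bit][0];
      out[1] ^= bases[bit][1];
    }
  return out;
}

int main() {
  // register=[[0, 0], [0, 0], [1, 0], [2, 0]] from the tests below:
  // 2^4 = 16 registers, but only 2^2 = 4 distinct elements.
  std::vector<Coord> bases = {{0, 0}, {0, 0}, {1, 0}, {2, 0}};
  for (unsigned r = 0; r < 16; ++r) {
    Coord c = applyRegisterBases(bases, r);
    std::printf("reg %2u -> (%d, %d)\n", r, c[0], c[1]);
  }
}

Registers 0 through 3 all land on element (0, 0): the two zero bases make four register ids aliases of one value, which is the redundancy the slice/concat lowering now has to handle.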

8 files changed: +277 -80 lines changed

test/Conversion/amd/invalid_concat_op.mlir

Lines changed: 8 additions & 7 deletions
@@ -157,18 +157,19 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
 // -----
 
 // Different layouts 2
-#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
-#dst_layout = #ttg.linear<{register=[[0, 0], [0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
+// Case when src and dst layouts have same CTA tile shape, but different number of registers
+#src_layout = #ttg.linear<{register=[[1, 0], [2, 0]], lane=[[4, 0], [8, 0], [16, 0], [0, 1], [0, 2], [0, 4]], warp=[[0, 0], [0, 8]], block=[]}>
+#dst_layout = #ttg.linear<{register=[[1, 0]], lane=[[4, 0], [8, 0], [16, 0], [0, 1], [0, 2], [0, 4]], warp=[[2, 0], [0, 8]], block=[]}>
 module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
   tt.func @invalid_concat(
-    %arg0: tensor<128x128xf32, #src_layout>,
-    %arg1: tensor<128x128xf32, #src_layout>,
-    %arg2: tensor<128x128xf32, #src_layout>,
-    %arg3: tensor<128x128xf32, #src_layout>) {
+    %arg0: tensor<32x16xf32, #src_layout>,
+    %arg1: tensor<32x16xf32, #src_layout>,
+    %arg2: tensor<32x16xf32, #src_layout>,
+    %arg3: tensor<32x16xf32, #src_layout>) {
 
   // expected-error @+1 {{Register basis must match on a CTA tile between source and destination.}}
   %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3:
-    tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout> -> tensor<256x256xf32, #dst_layout>
+    tensor<32x16xf32, #src_layout>, tensor<32x16xf32, #src_layout>, tensor<32x16xf32, #src_layout>, tensor<32x16xf32, #src_layout> -> tensor<64x32xf32, #dst_layout>
   tt.return
   }
 }

test/Conversion/amd/invalid_extractslice_to_llvm.mlir

Lines changed: 12 additions & 9 deletions
@@ -83,20 +83,23 @@ tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.div
 
 // Invalid layout 1
 #dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
-#src_layout = #ttg.linear<{register=[[0, 0], [0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
-tt.func @invalid_register_base(%arg0: tensor<256x256xi32, #src_layout> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{Register basis must match on a CTA tile between source and destination}}
+#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4], [0, 0]], warp=[[0, 32], [32, 0]], block=[]}>
+tt.func @invalid_lane_warp_basis(%arg0: tensor<256x256xi32, #src_layout> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout}}
   %2 = amdgpu.extract_slice %arg0 [0, 0] : tensor<256x256xi32, #src_layout> to tensor<128x128xi32, #dst_layout>
   tt.return
 }
 
 // -----
 
 // Invalid layout 2
-#dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
-#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4], [0, 0]], warp=[[0, 32], [32, 0]], block=[]}>
-tt.func @invalid_lane_warp_basis(%arg0: tensor<256x256xi32, #src_layout> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout}}
-  %2 = amdgpu.extract_slice %arg0 [0, 0] : tensor<256x256xi32, #src_layout> to tensor<128x128xi32, #dst_layout>
-  tt.return
+// Case when src and dst layouts have same CTA tile shape, but different number of registers
+#src_layout = #ttg.linear<{register=[[1, 0], [2, 0]], lane=[[4, 0], [8, 0], [16, 0], [0, 1], [0, 2], [0, 4]], warp=[[0, 0], [0, 8]], block=[]}>
+#dst_layout = #ttg.linear<{register=[[1, 0]], lane=[[4, 0], [8, 0], [16, 0], [0, 1], [0, 2], [0, 4]], warp=[[2, 0], [0, 8]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @invalid_concat(%arg0: tensor<64x32xi32, #src_layout>) {
+    // expected-error @+1 {{Register basis must match on a CTA tile between source and destination.}}
+    %1 = amdgpu.extract_slice %arg0 [0, 0] : tensor<64x32xi32, #src_layout> to tensor<32x16xi32, #dst_layout>
+    tt.return
+  }
 }

test/TritonGPU/amd/amd-concat-op.mlir

Lines changed: 36 additions & 0 deletions
@@ -103,3 +103,39 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
     tt.return
   }
 }
+
+// -----
+
+// Each input tensor broadcasts 4 registers along dimension 1, resulting in total 16 values per input.
+// Output tensor do not have redundancy in registers and holds 8 values.
+// Check that concat copies only 4 values from each input tensor, 8 in total.
+#src_layout = #ttg.linear<{register=[[0, 0], [0, 0], [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+#dst_layout = #ttg.linear<{register=[ [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @concat_from_broadcasted_tensor(%arg0: tensor<128x1xi32, #src_layout>, %arg1: tensor<128x1xi32, #src_layout> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @concat_from_broadcasted_tensor
+    // CHECK-COUNT-16: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-16: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-8: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %1 = amdgpu.concat %arg0, %arg1: tensor<128x1xi32, #src_layout>, tensor<128x1xi32, #src_layout> -> tensor<256x1xi32, #dst_layout>
+    tt.return
+  }
+}
+
+// -----
+
+// Input tensors do not have redundancy in register and hold 4 values each.
+// Output tensor broadcasts 4 registers along dimension 1, resulting in total 32 values.
+// Check that concat duplicates 4 values from each input 4 times, resulting in total 32 values.
+#src_layout = #ttg.linear<{register=[ [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+#dst_layout = #ttg.linear<{register=[[0, 0], [0, 0], [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @concat_to_broadcasted_tensor(%arg0: tensor<128x1xi32, #src_layout>, %arg1: tensor<128x1xi32, #src_layout> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @concat_to_broadcasted_tensor
+    // CHECK-COUNT-4: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-4: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-32: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %1 = amdgpu.concat %arg0, %arg1: tensor<128x1xi32, #src_layout>, tensor<128x1xi32, #src_layout> -> tensor<256x1xi32, #dst_layout>
+    tt.return
+  }
+}
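
A quick check of the CHECK-COUNT values above, assuming one value per register basis bit: the first test's source layout has 4 register bases (two of them zero), so each 128x1 input carries 2^4 = 16 values and all 16 are extracted; the destination keeps 2^2 = 4 non-broadcast values per 128-row tile, so concatenating two tiles emits the 8 inserts. The second test mirrors this: 2^2 = 4 extracts per input, and a destination with two zero bases broadcasts each tile to 2^4 = 16 values, giving 32 inserts for two tiles. The extract_slice tests in the next file follow the same arithmetic.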

test/TritonGPU/amd/amd-extractslice-op.mlir

Lines changed: 34 additions & 0 deletions
@@ -55,3 +55,37 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
     tt.return
   }
 }
+
+// -----
+
+// Input tensor broadcasts 4 registers along dimension 1, resulting in total 32 values in tensor and 16 values per [128x1] tile.
+// Output tensor do not have redundancy in register and holds 4 values.
+// Test checks that extract slice copies only 4 values from input to output.
+#blocked1 = #ttg.linear<{register=[[0, 0], [0, 0], [1, 0], [2, 0], [128, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+#blocked2 = #ttg.linear<{register=[ [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @extract_from_broadcasted_tensor(%arg0: tensor<256x1xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_from_broadcasted_tensor
+    // CHECK-COUNT-32: %{{.*}} = llvm.extractvalue %{{.*}} : !llvm.struct
+    // CHECK-COUNT-4: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %0 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x1xi32, #blocked1> to tensor<128x1xi32, #blocked2>
+    tt.return
+  }
+}
+
+// -----
+
+// Input tensor do not have broadcasted registers, resulting in total 8 values in tensor and 4 values per [128x1] tile.
+// Output tensor broadcasts 4 registers along dimension 1 and total 16 values.
+// Test checks that extract slice duplicates 4 values from input in 16 output values.
+#blocked1 = #ttg.linear<{register=[ [1, 0], [2, 0], [128, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+#blocked2 = #ttg.linear<{register=[[0, 0], [0, 0], [1, 0], [2, 0]], lane=[[0, 0], [0, 0], [0, 0], [4, 0], [8, 0], [16, 0]], warp=[[0, 0], [32, 0], [64, 0]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @extract_to_broadcasted_tensor(%arg0: tensor<256x1xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_to_broadcasted_tensor
+    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %{{.*}} : !llvm.struct
+    // CHECK-COUNT-16: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %72 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x1xi32, #blocked1> to tensor<128x1xi32, #blocked2>
+    tt.return
+  }
+}

third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp

Lines changed: 15 additions & 3 deletions
@@ -72,6 +72,10 @@ bool hasMatchingCTATileLayoutForSliceConcat(
   auto dstLL = triton::gpu::toLinearLayout(dstShape, dstTy.getEncoding());
 
   MLIRContext *ctx = srcTy.getContext();
+  auto kReg = StringAttr::get(ctx, "register");
+  srcLL = srcLL.removeZeroBasesAlongDim(kReg);
+  dstLL = dstLL.removeZeroBasesAlongDim(kReg);
+
   auto getBases = [&](StringRef name) {
     auto key = StringAttr::get(ctx, name);
     return std::pair{srcLL.getBases().lookup(key),
@@ -98,14 +102,22 @@ bool hasMatchingCTATileLayoutForSliceConcat(
     numCTAs *= srcShape[d] / shapeCTASrc[d];
   }
 
-  unsigned elemsPerThreadPerCTA =
-      triton::gpu::getTotalElemsPerThread(srcTy) / numCTAs;
-  unsigned regCompareLen = std::log2(elemsPerThreadPerCTA);
+  assert(llvm::isPowerOf2_32(numCTAs) &&
+         "expect number of CTAs to be power of 2");
+
+  unsigned totalElemsPerThreadNoBroadcastLog = regSrc.size();
+  unsigned elemsPerThreadPerCTALog =
+      totalElemsPerThreadNoBroadcastLog - llvm::Log2_32(numCTAs);
+  unsigned regCompareLen = elemsPerThreadPerCTALog;
 
   auto compareBasis = [&](auto &srcBasis, auto &dstBasis, StringRef message,
                           int limit = -1) {
     int n = (limit < 0 ? srcBasis.size()
                        : std::min<unsigned>(srcBasis.size(), limit));
+    if (dstBasis.size() < n) {
+      emitError(message);
+      return false;
+    }
     for (size_t i = 0; i < n; ++i) {
      if (srcBasis[i] != dstBasis[i]) {
        emitError(message);
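
Reading the diff, a worked example of why the old arithmetic could crash on broadcasted layouts (with numCTAs = 1, as in the new tests): the source layout register=[[0, 0], [0, 0], [1, 0], [2, 0]] gives getTotalElemsPerThread = 16, so the old code computed regCompareLen = log2(16) = 4, even though the destination only has 2 register bases, and compareBasis could then index past the end of the shorter basis list. The new code strips zero bases first and derives the length from regSrc.size(), here 2, and the added dstBasis.size() < n guard turns a residual length mismatch into a verifier error rather than an out-of-bounds read.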

third_party/amd/lib/TritonAMDGPUDialectToLLVM/ConcatOpToLLVM.cpp

Lines changed: 70 additions & 23 deletions
@@ -33,7 +33,11 @@ struct ConcatOpConversion : public ConvertOpToLLVMPattern<amdgpu::ConcatOp> {
 
     MLIRContext *context = resultType.getContext();
     auto linearLayoutSrc = triton::gpu::toLinearLayout(srcShape, srcEncoding);
-    auto linearLayoutDst = triton::gpu::toLinearLayout(dstShape, dstEncoding);
+    auto outDimNames = llvm::to_vector(linearLayoutSrc.getOutDimNames());
+    // Call transposeOuts, to ensure that order of input and output tensor
+    // element coordinates are compatible on stage 8 in algorithm below.
+    auto linearLayoutDst = triton::gpu::toLinearLayout(dstShape, dstEncoding)
+                               .transposeOuts(outDimNames);
     auto srcCTAOrder = LLVM::AMD::getCTATileOrder(context, linearLayoutSrc);
     auto dstCTAOrder = LLVM::AMD::getCTATileOrder(context, linearLayoutSrc);
 
@@ -69,34 +73,77 @@ struct ConcatOpConversion : public ConvertOpToLLVMPattern<amdgpu::ConcatOp> {
       unpackedSources.push_back(unpackLLElements(loc, currSrc, rewriter));
     }
 
-    // Traverse CTA tiles in the result tensor
-    for (int i = 0; i < numCTATiles; ++i) {
-      auto currTileIdx = mlir::LLVM::delinearize(i, dstCTAShape, dstCTAOrder);
+    // Algorithm:
+    // 1. for all registers in src tensor
+    // 2. compute src location in tensor relative to tile beginnig
+    // 3. save mapping from src elem coordinates to register idx
+    // 4. for all elements in dst tensor
+    // 5. get dst value location in tensor
+    // 6. find, which input tile holds the dst value
+    // 7. subtract dst coordinates and start coordinates of the tile
+    // 8. find source register number which holds dst value
+    // 9. copy dst element from computed tile and register
+    auto ctx = rewriter.getContext();
+    StringAttr kReg = StringAttr::get(ctx, "register");
+    auto srcRegBases = linearLayoutSrc.getBases().lookup(kReg);
+    auto dstRegBases = linearLayoutDst.getBases().lookup(kReg);
+
+    using ElemLocationKey = decltype(linearLayoutSrc.apply({}));
+    llvm::MapVector<ElemLocationKey, unsigned> srcElemToReg;
+    int srcRegNum = 1 << srcRegBases.size();
+    // 1. for all registers in src tensor
+    for (int regId = 0; regId < srcRegNum; ++regId) {
+      // 2. compute src location in tensor relative to tile beginnig
+      SmallVector<std::pair<StringAttr, int32_t>> hardwareLocation;
+      for (auto dimName : linearLayoutSrc.getInDimNames()) {
+        if (dimName == kReg)
+          hardwareLocation.push_back({dimName, regId});
+        else
+          hardwareLocation.push_back({dimName, 0});
+      }
+      auto elemCoords = linearLayoutSrc.apply(hardwareLocation);
+      // 3. save mapping from src elem coordinates to register idx
+      srcElemToReg[elemCoords] = regId;
+    }
+    // for every output register get element coords,
+    // find corresponding operand and copy src register
+    int dstRegNum = 1 << dstRegBases.size();
+    // 4. for all elements in dst tensor
+    for (int regId = 0; regId < dstRegNum; ++regId) {
+      SmallVector<std::pair<StringAttr, int32_t>> hardwareLocation;
+      // 5. get dst value location in tensor
+      for (auto dimName : linearLayoutDst.getInDimNames()) {
+        if (dimName == kReg)
+          hardwareLocation.push_back({dimName, regId});
+        else
+          hardwareLocation.push_back({dimName, 0});
+      }
+      auto elemCoords = linearLayoutDst.apply(hardwareLocation);
+      auto elemCoordsArray =
+          llvm::to_vector(llvm::make_second_range(elemCoords));
       // The n-dim destination tensor is built by arranging n-dim source tensors
       // into a destination tensor shape. Determine which source tensor contains
       // the current CTA tile.
-      auto multiDimSrcIdx = LLVM::AMD::multiDimElementwise<unsigned, unsigned>(
-          currTileIdx, srcCTAShape, std::divides<unsigned>());
+      auto multiDimOperandIdx =
+          LLVM::AMD::multiDimElementwise<int32_t, int64_t>(
+              elemCoordsArray, srcShape, std::divides<unsigned>());
       // Compute linear index of the current source tensor.
       // Concat operands are laid out in the destination tensor
       // in fastest slowest varying dimension order.
-      auto linearSrcIdx =
-          mlir::LLVM::linearize(multiDimSrcIdx, srcToDstShape, defaultOrder);
-
-      // After determining which source tensor the current CTA tile belongs to,
-      // compute the index of this CTA tile within that source tensor,
-      // considering the source tensors may include CTA tiles.
-      auto multiDimSrcCTAIdx =
-          LLVM::AMD::multiDimElementwise<unsigned, unsigned>(
-              currTileIdx, srcCTAShape, std::modulus<unsigned>());
-      auto linearSrcCTAIdx =
-          mlir::LLVM::linearize(multiDimSrcCTAIdx, srcCTAShape, srcCTAOrder);
-      auto unpackedElements = unpackedSources[linearSrcIdx];
-
-      auto startIt =
-          unpackedElements.begin() + linearSrcCTAIdx * elemsPerThreadPerCTA;
-      auto endIt = startIt + elemsPerThreadPerCTA;
-      llvm::append_range(resultVals, llvm::make_range(startIt, endIt));
+      // 6. find, which input tile holds the dst value
+      auto linearOperandIdx = mlir::LLVM::linearize(
+          multiDimOperandIdx, srcToDstShape, defaultOrder);
+
+      // 7. subtract dst coordinates and start coordinates of the tile
+      for (int dim = 0; dim < rank; ++dim)
+        elemCoords[dim].second -= multiDimOperandIdx[dim] * srcShape[dim];
+
+      assert(srcElemToReg.contains(elemCoords));
+      // 8. find source register number which holds dst value
+      int srcRegIdx = srcElemToReg.lookup(elemCoords);
+
+      // 9. copy dst element from found tile and register
+      resultVals.push_back(unpackedSources[linearOperandIdx][srcRegIdx]);
     }
 
     Value packedResult = packLLElements(loc, this->getTypeConverter(),
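
To make the nine numbered steps concrete, here is a minimal self-contained sketch of the same idea for a 1-D concat, with the layout reduced to its register bases (lane and warp held at zero) and bases combined by XOR as in Triton's linear layouts; all names are illustrative, not the actual helpers used above:

#include <cstdint>
#include <map>
#include <vector>

using Bases = std::vector<int32_t>; // one coordinate per register bit

// XOR together the bases selected by the set bits of regId.
int32_t applyBases(const Bases &b, unsigned regId) {
  int32_t coord = 0;
  for (size_t bit = 0; bit < b.size(); ++bit)
    if (regId & (1u << bit))
      coord ^= b[bit];
  return coord;
}

// Concatenate `inputs` (each covering srcLen elements of tensor space,
// indexed by source register) into one output laid out by dstBases.
std::vector<int> concatRegisters(const std::vector<std::vector<int>> &inputs,
                                 const Bases &srcBases, const Bases &dstBases,
                                 int32_t srcLen) {
  // Steps 1-3: map source element coordinate -> source register index.
  std::map<int32_t, unsigned> srcElemToReg;
  for (unsigned r = 0; r < (1u << srcBases.size()); ++r)
    srcElemToReg[applyBases(srcBases, r)] = r;

  std::vector<int> result;
  // Steps 4-5: walk destination registers and compute their coordinates.
  for (unsigned r = 0; r < (1u << dstBases.size()); ++r) {
    int32_t coord = applyBases(dstBases, r);
    // Step 6: which operand tile holds this coordinate.
    int32_t operandIdx = coord / srcLen;
    // Step 7: rebase the coordinate to the start of that tile.
    int32_t local = coord - operandIdx * srcLen;
    // Steps 8-9: copy from the source register that holds this element.
    result.push_back(inputs[operandIdx][srcElemToReg.at(local)]);
  }
  return result;
}

With srcBases = {1, 2} (each operand holds 2^2 = 4 elements, srcLen = 4) and dstBases = {1, 2, 4}, the loop visits 8 destination registers and routes coordinates 0-3 to inputs[0] and 4-7 to inputs[1]. A destination basis list with zero bases, e.g. {0, 0, 1, 2}, revisits the same coordinates and duplicates the copied values, which is the broadcast behavior the new tests check.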
