
Commit 5d8325f

plognjen and oplavsic authored

[AMD] Support MI350 ds_read_b64_tr_b8 instruction for int8 (#6018)

This commit adds support for the ds_read_b64_tr_b8 instruction. Currently, we only enable it for int8 element types. It should also work for fp8, which will be turned on later with more testing.

---------

Co-authored-by: Ognjen Plavsic <[email protected]>
1 parent 5c05106 commit 5d8325f
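
The commit message gates the new ds_read_b64_tr_b8 path to int8 element types for now, with fp8 expected to follow after more testing. A minimal sketch of such a type gate (hypothetical helper name and placement, not code from this commit; the actual check lives in the AMD backend lowering, which is not shown in this excerpt):

#include "mlir/IR/Types.h"

// Hypothetical illustration only -- not part of this diff.
// ds_read_b64_tr_b8 transposes 8-bit elements; this commit enables the
// path solely for int8. fp8 should also work and is planned to be enabled
// after more testing.
static bool useDsReadB64TrB8(mlir::Type elemTy) {
  return elemTy.isInteger(8);
}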

File tree

5 files changed: +412 -172 lines changed


include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 2 additions & 2 deletions

@@ -264,8 +264,8 @@ LinearLayout chooseLdMatrixLayout(Attribute enc, ArrayRef<int64_t> shape,
 
 // The primary goal of this function is to efficiently load 2D tiles of a
 // tensor from shared memory using the `ds_read_tr` instruction for AMD GPUs.
-LinearLayout chooseDsReadB64Tr16Layout(Attribute enc, ArrayRef<int64_t> shape,
-                                       int32_t elemBitWidth);
+LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
+                                     int32_t elemBitWidth);
 
 // Create LinearLayout for mxfp4 and mxfp8 operand in scaled mfma.
 // For mxfp4, we use dot layout directly. Mxfp8 is not covered by dot
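
The header change is a pure rename: chooseDsReadB64Tr16Layout becomes chooseDsReadB64TrLayout, since the helper now covers 8-bit as well as 16-bit elements. A hypothetical call-site sketch (the wrapper name and arguments are illustrative, not from this commit):

// Hypothetical wrapper showing the widened contract of the renamed helper.
// `enc` and `shape` come from the caller; `elemTy` may now be i8 as well
// as a 16-bit type.
LinearLayout chooseTransposedReadLayout(Attribute enc, ArrayRef<int64_t> shape,
                                        Type elemTy) {
  return chooseDsReadB64TrLayout(enc, shape, elemTy.getIntOrFloatBitWidth());
}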

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 81 additions & 63 deletions

@@ -393,12 +393,12 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
 }
 
-LinearLayout chooseDotDsReadB64Tr16Layout(DotOperandEncodingAttr dotMfmaLayout,
-                                          ArrayRef<int64_t> shape,
-                                          int32_t elemBitWidth) {
+LinearLayout chooseDotDsReadB64TrLayout(DotOperandEncodingAttr dotMfmaLayout,
+                                        ArrayRef<int64_t> shape,
+                                        int32_t elemBitWidth) {
   auto mfmaLayout = llvm::cast<AMDMfmaEncodingAttr>(dotMfmaLayout.getParent());
   assert(mfmaLayout.getMDim() == 16 || mfmaLayout.getNDim() == 32);
-  assert(elemBitWidth == 16);
+  assert(elemBitWidth == 16 || elemBitWidth == 8);
 
   auto rank = shape.size();
   bool hasBatchDim = rank == 3;
@@ -407,6 +407,7 @@ LinearLayout chooseDotDsReadB64Tr16Layout(DotOperandEncodingAttr dotMfmaLayout,
   // loads for most element sizes (16b, 8b, 4b).
   const int32_t ldsReadWidth = 64;
   int32_t kWidthTransRead = ldsReadWidth / elemBitWidth;
+  const int elemByteWidth = elemBitWidth / 8;
   auto kDim = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
 
   int32_t kSize = shape[kDim];
@@ -427,72 +428,92 @@ LinearLayout chooseDotDsReadB64Tr16Layout(DotOperandEncodingAttr dotMfmaLayout,
   SmallVector<unsigned> order = dotMfmaLayout.getDefaultOrder();
   std::swap(order[0], order[1]);
 
-  // In the LDS transpose logic, each thread accesses 64 bits (8 bytes) of data.
-  // The smallest unit for transposing is a 4x4 sub-tile of threads, where each
-  // thread reads 4 16-bit elements along the non-K dimension, resulting in a
-  // [non-K, K] = {16, 4} sub-tile of elements. Because of transposing
-  // mechanism, thread ends up with 4 16-bit elements along K dim.
+  // For ds_read_b64_tr_* instructions, each thread accesses 64 bits (8 bytes)
+  // of data. The smallest unit for transposition is a
+  // [non-K, K] = {16, kWidthTransRead} sub-tile of elements,
+  // where each thread reads kWidthTransRead elements along the non-K dimension.
+  // Due to the transposition mechanism, each thread ends up with
+  // kWidthTransRead elements along the K dimension.
   //
   // The MFMA selection logic prioritizes double-rate MFMA instructions whenever
-  // possible. Specifically:
-  // - For MFMA operations that are non-K = 16, when blockK > 16, mfma16x16x32
-  //   is selected; otherwise (blockK ≤ 16), mfma16x16x16 remains the choice.
-  // - For MFMA operations that are non-K = 32, when blockK > 8, mfma32x32x16 is
-  //   selected; otherwise (blockK ≤ 8), mfma32x32x8 is used.
+  // possible:
   //
-  // In double-rate MFMA instructions, each thread holds 8 elements along the K
-  // dimension.
-  // - The first 4 elements belong to the first sub-tile.
-  // - The next 4 elements belong to the second sub-tile.
+  // - For MFMA operations where M = N = 16, when blockK > k, mfma16x16x2*k
+  //   is selected; otherwise (blockK ≤ k), mfma16x16xk remains the choice.
   //
-  // We then group these into larger tiles, each consisting of 8 of these 16x4
-  // sub-tiles. These tiles correspond to data for one mfma instruction. The
-  // shapes of these tiles depend on the MFMA instruction used:
-  // 1. For mfma32x32x16, the tile shape is [non-K, K] = {32, 16}.
-  // 2. For mfma16x16x32, the tile shape is [non-K, K] = {16, 32}.
+  // - For MFMA operations where M = N = 32, when blockK > k, mfma32x32x2*k is
+  //   selected; otherwise (blockK ≤ k), mfma32x32xk is used.
   //
-  // For single-rate mfma instructions, each thread holds 4 elements along K
-  // dimension. This means larger tile (that corresponds to one mfma
-  // instruction) consists of 4 16x4 sub-tiles.
-  std::vector<std::vector<int32_t>> registerBase = {{1, 0},
-                                                    {2, 0}}; // first sub-tile
-  std::vector<std::vector<int32_t>> laneBase = {{kWidthTransRead, 0},
-                                                {2 * kWidthTransRead, 0},
-                                                {0, 1},
-                                                {0, 2}}; // first sub-tile
-
-  // Extend register base for multiple tiles in K dimension (corresponding to
-  // multiple mfma instructions accross k dim).
-  auto populateRegisterBase = [&](int kTileSize) {
-    const int regsPerTile = 8;
-    int numRegs = (kSize / kTileSize) * regsPerTile;
-    for (int reg = regsPerTile; reg < numRegs; reg *= 2) {
+  // NOTE: For fp8 and fp4, "double-rate" results in 4*k since scaled MFMA
+  // instructions are used.
+  //
+  // In "double-rate" MFMA instructions, each thread holds 2*kWidthTransRead
+  // elements along the K dimension:
+  // - The first kWidthTransRead elements belong to the first sub-tile.
+  // - The next kWidthTransRead elements belong to the second sub-tile.
+  //
+  // These elements are then grouped into larger tiles, each consisting of
+  // 8 {16, kWidthTransRead} sub-tiles. These tiles correspond to the data
+  // for one MFMA instruction. The shape of these tiles depends on the MFMA
+  // instruction used.
+  //
+  // For single-rate MFMA instructions, each thread holds kWidthTransRead
+  // elements along the K dimension. This means that the larger tile
+  // (corresponding to one MFMA instruction) consists of 4 {16, kWidthTransRead}
+  // sub-tiles.
+  std::vector<std::vector<int32_t>> registerBase;
+  std::vector<std::vector<int32_t>> laneBase;
+
+  // Populate register base for first subtile
+  for (int i = 1; i < kWidthTransRead; i *= 2) {
+    registerBase.push_back({i, 0});
+  }
+
+  const int threadsPerSubtileNonK = 16 / kWidthTransRead;
+  const int threadsPerSubtileK = kWidthTransRead;
+
+  // Populate lane base for first subtile
+  for (int i = 1; i < threadsPerSubtileNonK; i *= 2) {
+    laneBase.push_back({i * kWidthTransRead, 0});
+  }
+  for (int i = 1; i < threadsPerSubtileK; i *= 2) {
+    laneBase.push_back({0, i});
+  }
+
+  // Function to extend register base for multiple tiles K dim.
+  auto extendRegisterBaseForKDim = [&](int kTileSize) {
+    const int regsPerTile = kWidthTransRead * 2; // Two subtiles per tile
+    int totalRegs = (kSize / kTileSize) * regsPerTile;
+
+    for (int reg = regsPerTile; reg < totalRegs; reg *= 2) {
       registerBase.push_back({0, (reg / regsPerTile) * kTileSize});
     }
   };
 
   const bool isMfma32 = (mfmaLayout.getMDim() == 32);
   const bool isMfma16 = (mfmaLayout.getMDim() == 16);
-  const int kTileSize = isMfma32 ? 16 : 32;
-
-  if (kSize >= kTileSize) {
-    // Handles mfma32x32x16 and mfma16x16x32 cases
-    assert(kWidthDot == 8);
-    registerBase.push_back({0, 4}); // second sub-tile
-    populateRegisterBase(kTileSize);
-    auto laneBaseExt = isMfma32
-                           ? std::vector<std::vector<int32_t>>{{16, 0}, {0, 8}}
-                           : std::vector<std::vector<int32_t>>{{0, 8}, {0, 16}};
-    laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+  const int kTileSize = isMfma32 ? 32 / elemByteWidth : 64 / elemByteWidth;
+  const bool largeKSize = kSize >= kTileSize;
+
+  // Extend register base for large K sizes.
+  if (largeKSize) {
+    registerBase.push_back({0, threadsPerSubtileK}); // Second subtile
+    extendRegisterBaseForKDim(kTileSize);
+  }
+
+  // Extend lane base based on MFMA size.
+  const int numSubtilesPerTile = largeKSize ? 2 : 1;
+  std::vector<std::vector<int32_t>> laneBaseExt;
+
+  if (isMfma32) {
+    laneBaseExt = {{16, 0}, {0, numSubtilesPerTile * threadsPerSubtileK}};
   } else {
-    // Handles mfma32x32x8 and mfma16x16x16 cases
-    assert(kWidthDot == 4);
-    auto laneBaseExt = isMfma32
-                           ? std::vector<std::vector<int32_t>>{{16, 0}, {0, 4}}
-                           : std::vector<std::vector<int32_t>>{{0, 4}, {0, 8}};
-    laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+    laneBaseExt = {{0, numSubtilesPerTile * threadsPerSubtileK},
+                   {0, 2 * numSubtilesPerTile * threadsPerSubtileK}};
   }
 
+  laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+
   // Base vectors above are defined in a fixed order [non-k-dim, k-dim].
   // To assign them to actual matrix dimensions `order` array is used.
   // For operand A: non-k-dim -> dim0, k-dim -> dim1
@@ -516,10 +537,7 @@ LinearLayout chooseDotDsReadB64Tr16Layout(DotOperandEncodingAttr dotMfmaLayout,
 
   LinearLayout ctaLayout = tileLayout.transposeOuts(outDimNames) *
                            warpLayout.transposeOuts(outDimNames);
-  auto finalLayout =
-      combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
-
-  return finalLayout;
+  return combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
 }
 
 LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
@@ -1334,10 +1352,10 @@ LinearLayout chooseLdMatrixLayout(Attribute enc, ArrayRef<int64_t> shape,
   return chooseDotLdMatrixLayout(dot, shape, needTrans, elemBitWidth);
 }
 
-LinearLayout chooseDsReadB64Tr16Layout(Attribute enc, ArrayRef<int64_t> shape,
-                                       int32_t elemBitWidth) {
+LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
+                                     int32_t elemBitWidth) {
   auto dot = cast<DotOperandEncodingAttr>(enc);
-  return chooseDotDsReadB64Tr16Layout(dot, shape, elemBitWidth);
+  return chooseDotDsReadB64TrLayout(dot, shape, elemBitWidth);
 }
 
 LinearLayout
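
To make the generalization concrete, the following standalone sketch replays the parameter arithmetic from chooseDotDsReadB64TrLayout for 16-bit and 8-bit elements (the constants are copied from the diff above; the program itself is illustrative only):

#include <cstdio>

int main() {
  const int ldsReadWidth = 64; // bits read per lane by ds_read_b64_tr_*
  for (int elemBitWidth : {16, 8}) {
    const int kWidthTransRead = ldsReadWidth / elemBitWidth; // elems per lane
    const int elemByteWidth = elemBitWidth / 8;
    const int threadsPerSubtileNonK = 16 / kWidthTransRead;
    const int threadsPerSubtileK = kWidthTransRead;
    // kTileSize as computed in the diff: isMfma32 ? 32/B : 64/B.
    const int kTileMfma32 = 32 / elemByteWidth;
    const int kTileMfma16 = 64 / elemByteWidth;
    std::printf("b%-2d: kWidthTransRead=%d subtile=[16 x %d] lanes=%dx%d "
                "kTile(mfma32)=%d kTile(mfma16)=%d\n",
                elemBitWidth, kWidthTransRead, kWidthTransRead,
                threadsPerSubtileNonK, threadsPerSubtileK,
                kTileMfma32, kTileMfma16);
  }
  return 0;
}

For 16-bit elements this reproduces the previously hard-coded tile sizes (kTileSize 16 for mfma32, 32 for mfma16); for 8-bit elements the K tiles double to 32 and 64, which matches the kWidth = 16 (2 * kWidthTransRead) used by the new i8 tests below.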

test/Conversion/amd/ds_transpose.mlir

Lines changed: 68 additions & 0 deletions

@@ -74,4 +74,72 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xf16, #shared, #smem, mutable> -> tensor<64x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 8}>>
     tt.return
   }
+
+  // CHECK-LABEL: ds_transpose_n_t_i8_mfma_16
+  tt.func @ds_transpose_n_t_i8_mfma_16(%arg0: !ttg.memdesc<128x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-16: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 16}>>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared1, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma16, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_t_i8_mfma_16
+  tt.func @ds_transpose_t_t_i8_mfma_16(%arg0: !ttg.memdesc<128x64xi8, #shared1, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xi8>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared1, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 16}>>
+    // CHECK-COUNT-8: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared1, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma16, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_n_n_i8_mfma_16
+  tt.func @ds_transpose_n_n_i8_mfma_16(%arg0: !ttg.memdesc<128x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared, #smem, mutable>) {
+    // CHECK-COUNT-8: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 16}>>
+    // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xi8>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma16, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_n_i8_mfma_16
+  tt.func @ds_transpose_t_n_i8_mfma_16(%arg0: !ttg.memdesc<128x64xi8, #shared1, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared, #smem, mutable>) {
+    // CHECK-NOT: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared1, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 16}>>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma16, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_n_t_i8_mfma32
+  tt.func @ds_transpose_n_t_i8_mfma32(%arg0: !ttg.memdesc<128x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-16: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared1, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_t_i8_mfma32
+  tt.func @ds_transpose_t_t_i8_mfma32(%arg0: !ttg.memdesc<128x64xi8, #shared1, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xi8>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared1, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    // CHECK-COUNT-6: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared1, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_n_n_i8_mfma32
+  tt.func @ds_transpose_n_n_i8_mfma32(%arg0: !ttg.memdesc<128x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared, #smem, mutable>) {
+    // CHECK-COUNT-8: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    // CHECK-COUNT-4: llvm.load %{{.*}} : !llvm.ptr<3> -> vector<16xi8>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_n_i8_mfma32
+  tt.func @ds_transpose_t_n_i8_mfma32(%arg0: !ttg.memdesc<128x64xi8, #shared1, #smem, mutable>, %arg1: !ttg.memdesc<64x128xi8, #shared, #smem, mutable>) {
+    // CHECK-NOT: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
+    %1 = ttg.local_load %arg0 : !ttg.memdesc<128x64xi8, #shared1, #smem, mutable> -> tensor<128x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    %2 = ttg.local_load %arg1 : !ttg.memdesc<64x128xi8, #shared, #smem, mutable> -> tensor<64x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
 }
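
A quick sanity check on the CHECK patterns above: each rocdl.ds.read.tr8.b64 returns vector<2xi32>, i.e. 8 bytes (eight i8 elements) per lane, while the fall-back llvm.load returns vector<16xi8>, i.e. 16 bytes per lane. With kWidth = 16, an operand needs 16 i8 elements per lane per MFMA tile, so (assuming no other replication effects) the two paths relate as:

  16 elements/lane = 2 × (8 elements per rocdl.ds.read.tr8.b64)
                   = 1 × (16 elements per llvm.load of vector<16xi8>)

which is why the transpose path shows roughly twice as many reads as the vector-load path for the same amount of data (e.g. 8 tr8 reads vs 4 vector loads in ds_transpose_n_n_i8_mfma_16).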
