
Commit 55db9b8

[AMD] Introduce LocalLoadPackedTransposedOp (#7422)
This operation transposes packed tensors, which is necessary because the shape of these tensors changes when the packing changes. It will be used to load FP4 values K-packed and K-contiguous when they are stored in shared memory packed along M/N and M/N-contiguous. The FP4 types are treated in the linear layout (LL) as if they were i8 types, since that is how they are also treated in the rest of the compiler pipeline. Note: for now the operation is introduced without being used (only tested); follow-up PRs will add the rest of the functionality.
1 parent 60a1996 commit 55db9b8
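
To make the shape change in the commit message concrete: repacking two-per-byte FP4 data from M/N-packed to K-packed doubles the M/N extent and halves the K extent of the i8 view. Below is a minimal standalone sketch of that repacking (illustrative only, not code from this PR; the nibble order is an arbitrary convention chosen for the example). It reproduces the memdesc<16x64xi8> -> tensor<32x32xi8> relationship exercised by the new lit tests further down.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Repack FP4 values (two per i8) from "packed along M" (fp4 rows 2r and 2r+1
// share byte row r) to "packed along K" (fp4 columns 2c and 2c+1 share byte
// column c). M and K are the i8 shape of the source tile.
std::vector<uint8_t> repackMToK(const std::vector<uint8_t> &src, int M, int K) {
  assert(K % 2 == 0);
  std::vector<uint8_t> dst(2 * M * (K / 2), 0); // (2*M) x (K/2) i8
  for (int m = 0; m < 2 * M; ++m) {   // logical fp4 row
    for (int k = 0; k < K; ++k) {     // logical fp4 column
      uint8_t byte = src[(m / 2) * K + k];
      uint8_t fp4 = (m % 2) ? (byte >> 4) : (byte & 0xF);
      uint8_t &out = dst[m * (K / 2) + k / 2];
      out |= (k % 2) ? (fp4 << 4) : fp4;
    }
  }
  return dst;
}

int main() {
  // A 16x64 i8 shared-memory tile (M/N packed) becomes a 32x32 i8 tile once
  // the values are K packed, i.e. M/N doubles and K halves.
  std::vector<uint8_t> a(16 * 64, 0x21);
  std::vector<uint8_t> b = repackMToK(a, 16, 64);
  std::printf("in: 16x64 i8, out: 32x%d i8\n", (int)(b.size() / 32));
}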

File tree

7 files changed: +478 -109 lines changed


lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 140 additions & 90 deletions

@@ -576,16 +576,21 @@ LinearLayout chooseDotDsReadB64TrLayout(DotOperandEncodingAttr dotMfmaLayout,
   auto mfmaLayout = llvm::cast<AMDMfmaEncodingAttr>(dotMfmaLayout.getParent());
   auto mDim = mfmaLayout.getMDim();
   assert(mDim == 16 || mDim == 32);
+
+  bool isFP4 = false;
+  if (elemBitWidth == 4) {
+    // When doing ds_read_tr4 we actually write the LL as if it were on i8
+    // elements this is becasue LL needs to be described for the i8 tensor
+    // elements.
+    elemBitWidth = 8;
+    isFP4 = true;
+  }
+
   assert(elemBitWidth == 16 || elemBitWidth == 8);

   auto rank = shape.size();
   bool hasBatchDim = rank == 3;
   int32_t kWidthDot = dotMfmaLayout.getKWidth();
-  // Number of bits loaded by an LDS read. ds_read_tr primarily supports 64-bit
-  // loads for most element sizes (16b, 8b, 4b).
-  const int32_t ldsReadWidth = 64;
-  int32_t kWidthTransRead = ldsReadWidth / elemBitWidth;
-  const int elemByteWidth = elemBitWidth / 8;
   auto kDim = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;

   int32_t kSize = shape[kDim];

@@ -606,106 +611,151 @@ LinearLayout chooseDotDsReadB64TrLayout(DotOperandEncodingAttr dotMfmaLayout,
   SmallVector<unsigned> order =
       getOrderForDotOperand(dotMfmaLayout.getOpIdx(), rank, /*kContig*/ false);

-  // For ds_read_b64_tr_* instructions, each thread accesses 64 bits (8 bytes)
-  // of data. The smallest unit for transposition is a
-  // [non-K, K] = {16, kWidthTransRead} sub-tile of elements,
-  // where each thread reads kWidthTransRead elements along the non-K dimension.
-  // Due to the transposition mechanism, each thread ends up with
-  // kWidthTransRead elements along the K dimension.
-  //
-  // The MFMA selection logic prioritizes double-rate MFMA instructions whenever
-  // possible:
-  //
-  // - For MFMA operations where M = N = 16, when blockK > k, mfma16x16x2*k
-  //   is selected; otherwise (blockK ≤ k), mfma16x16xk remains the choice.
-  //
-  // - For MFMA operations where M = N = 32, when blockK > k, mfma32x32x2*k is
-  //   selected; otherwise (blockK ≤ k), mfma32x32xk is used.
-  //
-  // NOTE: For fp8 and fp4, "double-rate" results in 4*k since scaled MFMA
-  //       instructions are used.
-  //
-  // In "double-rate" MFMA instructions, each thread holds 2*kWidthTransRead
-  // elements along the K dimension:
-  // - The first kWidthTransRead elements belong to the first sub-tile.
-  // - The next kWidthTransRead elements belong to the second sub-tile.
-  //
-  // These elements are then grouped into larger tiles, each consisting of
-  // 8 {16, kWidthTransRead} sub-tiles. These tiles correspond to the data
-  // for one MFMA instruction. The shape of these tiles depends on the MFMA
-  // instruction used.
-  //
-  // For single-rate MFMA instructions, each thread holds kWidthTransRead
-  // elements along the K dimension. This means that the larger tile
-  // (corresponding to one MFMA instruction) consists of 4 {16, kWidthTransRead}
-  // sub-tiles.
   std::vector<std::vector<int32_t>> registerBase;
   std::vector<std::vector<int32_t>> laneBase;
+  auto populateFP4LL = [&registerBase, &laneBase](int kSize, int mDim) {
+    const bool isMfma32 = (mDim == 32);
+    // ds_read_b64_tr4 operates on FP4 values swapping the packing of them. Look
+    // at i8 values for the ownership of register/lane since it's the data type
+    // of the tensor. Register dimension: what i8 in the tile are held by thread
+    // 0? Lane dimension: what i8 in the tile are held in register 0 of each
+    // thread?
+    registerBase.push_back({1, 0});
+    registerBase.push_back({2, 0});
+    registerBase.push_back({4, 0});
+    registerBase.push_back({0, 16});
+
+    // If more than one tile needs to be loaded, populate registerBase
+    // dimension for the other tiles
+    const int kTileSize = isMfma32 ? 64 : 128;
+    for (int reg = kTileSize; reg < kSize; reg *= 2) {
+      registerBase.push_back({0, reg});
+    }

-  // Populate register base for first subtile
-  for (int i = 1; i < kWidthTransRead; i *= 2) {
-    registerBase.push_back({i, 0});
-  }
-
-  const int threadsPerSubtileNonK = 16 / kWidthTransRead;
-  const int threadsPerSubtileK = kWidthTransRead;
-
-  // Populate lane base for first subtile
-  for (int i = 1; i < threadsPerSubtileNonK; i *= 2) {
-    laneBase.push_back({i * kWidthTransRead, 0});
-  }
-  for (int i = 1; i < threadsPerSubtileK; i *= 2) {
-    laneBase.push_back({0, i});
-  }
-
-  // Function to extend register base for multiple tiles K dim.
-  auto extendRegisterBaseForKDim = [&](int kTileSize, int numSubtilesPerTile) {
-    const int regsPerTile = kWidthTransRead * numSubtilesPerTile;
-    int totalRegs = (kSize / kTileSize) * regsPerTile;
-
-    for (int reg = regsPerTile; reg < totalRegs; reg *= 2) {
-      registerBase.push_back({0, (reg / regsPerTile) * kTileSize});
+    // When mDim == 16 we have 16x128 mfma, otherwise it's 16x64
+    // The LL for the two is different
+    laneBase.push_back({0, 1});
+    laneBase.push_back({0, 2});
+    laneBase.push_back({0, 4});
+    laneBase.push_back({0, 8});
+    if (mDim == 16) {
+      laneBase.push_back({0, 32});
+      laneBase.push_back({0, 64});
+    } else {
+      assert(mDim == 32);
+      laneBase.push_back({8, 0});
+      laneBase.push_back({0, 32});
     }
   };
+  auto populateLL = [&registerBase, &laneBase](int elemBitWidth, int kSize,
+                                               int kWidthDot, int mDim) {
+    // Number of bits loaded by an LDS read. ds_read_tr primarily supports
+    // 64-bit loads for most element sizes (16b, 8b, 4b).
+    const int32_t ldsReadWidth = 64;
+    int32_t kWidthTransRead = ldsReadWidth / elemBitWidth;
+    const int elemByteWidth = elemBitWidth / 8;
+    const bool isMfma32 = (mDim == 32);
+
+    // For ds_read_b64_tr_* instructions, each thread accesses 64 bits (8 bytes)
+    // of data. The smallest unit for transposition is a
+    // [non-K, K] = {16, kWidthTransRead} sub-tile of elements,
+    // where each thread reads kWidthTransRead elements along the non-K
+    // dimension. Due to the transposition mechanism, each thread ends up with
+    // kWidthTransRead elements along the K dimension.
+    //
+    // The MFMA selection logic prioritizes double-rate MFMA instructions
+    // whenever possible:
+    //
+    // - For MFMA operations where M = N = 16, when blockK > k, mfma16x16x2*k
+    //   is selected; otherwise (blockK ≤ k), mfma16x16xk remains the choice.
+    //
+    // - For MFMA operations where M = N = 32, when blockK > k, mfma32x32x2*k is
+    //   selected; otherwise (blockK ≤ k), mfma32x32xk is used.
+    //
+    // NOTE: For fp8 and fp4, "double-rate" results in 4*k since scaled MFMA
+    //       instructions are used.
+    //
+    // In "double-rate" MFMA instructions, each thread holds 2*kWidthTransRead
+    // elements along the K dimension:
+    // - The first kWidthTransRead elements belong to the first sub-tile.
+    // - The next kWidthTransRead elements belong to the second sub-tile.
+    //
+    // These elements are then grouped into larger tiles, each consisting of
+    // 8 {16, kWidthTransRead} sub-tiles. These tiles correspond to the data
+    // for one MFMA instruction. The shape of these tiles depends on the MFMA
+    // instruction used.
+    //
+    // For single-rate MFMA instructions, each thread holds kWidthTransRead
+    // elements along the K dimension. This means that the larger tile
+    // (corresponding to one MFMA instruction) consists of 4 {16,
+    // kWidthTransRead} sub-tiles.

+    // Populate register base for first subtile
+    for (int i = 1; i < kWidthTransRead; i *= 2) {
+      registerBase.push_back({i, 0});
+    }

-  const bool isMfma32 = (mDim == 32);
-  const bool isMfma16 = (mDim == 16);
-
-  // kDoubleTileSize is the k dimension of a tile when double rated
-  // mfma instructions are used.
-  const int kDoubleTileSize =
-      isMfma32 ? 32 / elemByteWidth : 64 / elemByteWidth;
-  // kTileSize is the actually k dimention of a tile, which is
-  // determined by kWidthDot.
-  const int kTileSize = kWidthDot * 64 / mDim;
-  // We use kDoubleTileSize as a reference to check whether the given
-  // kWidthDot leads to double or single sub-tiles in each tile.
-  const int numSubtilesPerTile = (kTileSize == kDoubleTileSize) ? 2 : 1;
-
-  // Extend register base for large K sizes.
-  if (numSubtilesPerTile == 2)
-    registerBase.push_back({0, threadsPerSubtileK}); // Second subtile
-
-  extendRegisterBaseForKDim(kTileSize, numSubtilesPerTile);
+    const int threadsPerSubtileNonK = 16 / kWidthTransRead;
+    const int threadsPerSubtileK = kWidthTransRead;

-  // Extend lane base based on MFMA size.
-  std::vector<std::vector<int32_t>> laneBaseExt;
+    // Populate lane base for first subtile
+    for (int i = 1; i < threadsPerSubtileNonK; i *= 2) {
+      laneBase.push_back({i * kWidthTransRead, 0});
+    }
+    for (int i = 1; i < threadsPerSubtileK; i *= 2) {
+      laneBase.push_back({0, i});
+    }

-  if (isMfma32) {
-    laneBaseExt = {{16, 0}, {0, numSubtilesPerTile * threadsPerSubtileK}};
-  } else {
-    laneBaseExt = {{0, numSubtilesPerTile * threadsPerSubtileK},
-                   {0, 2 * numSubtilesPerTile * threadsPerSubtileK}};
-  }
+    // Function to extend register base for multiple tiles K dim.
+    auto extendRegisterBaseForKDim = [&](int kTileSize,
+                                         int numSubtilesPerTile) {
+      const int regsPerTile = kWidthTransRead * numSubtilesPerTile;
+      int totalRegs = (kSize / kTileSize) * regsPerTile;
+
+      for (int reg = regsPerTile; reg < totalRegs; reg *= 2) {
+        registerBase.push_back({0, (reg / regsPerTile) * kTileSize});
+      }
+    };
+
+    // kDoubleTileSize is the k dimension of a tile when double rated
+    // mfma instructions are used.
+    const int kDoubleTileSize =
+        isMfma32 ? 32 / elemByteWidth : 64 / elemByteWidth;
+    // kTileSize is the actually k dimention of a tile, which is
+    // determined by kWidthDot.
+    const int kTileSize = kWidthDot * 64 / mDim;
+    // We use kDoubleTileSize as a reference to check whether the given
+    // kWidthDot leads to double or single sub-tiles in each tile.
+    const int numSubtilesPerTile = (kTileSize == kDoubleTileSize) ? 2 : 1;
+
+    // Extend register base for large K sizes.
+    if (numSubtilesPerTile == 2)
+      registerBase.push_back({0, threadsPerSubtileK}); // Second subtile
+
+    extendRegisterBaseForKDim(kTileSize, numSubtilesPerTile);
+
+    // Extend lane base based on MFMA size.
+    std::vector<std::vector<int32_t>> laneBaseExt;
+
+    if (isMfma32) {
+      laneBaseExt = {{16, 0}, {0, numSubtilesPerTile * threadsPerSubtileK}};
+    } else {
+      laneBaseExt = {{0, numSubtilesPerTile * threadsPerSubtileK},
+                     {0, 2 * numSubtilesPerTile * threadsPerSubtileK}};
+    }
+    laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+  };

-  laneBase.insert(laneBase.end(), laneBaseExt.begin(), laneBaseExt.end());
+  if (isFP4)
+    populateFP4LL(kSize, mDim);
+  else
+    populateLL(elemBitWidth, kSize, kWidthDot, mDim);

   // Base vectors above are defined in a fixed order [non-k-dim, k-dim].
   // To assign them to actual matrix dimensions we associate with register
   // `order` which is also [nonk, k] given we set kContig to false.
   LinearLayout tileLayout({{kRegister, registerBase}, {kLane, laneBase}},
                           {outDimNames[order[0]], outDimNames[order[1]]});
-
   if (hasBatchDim) {
     assert(order[2] == 0);
     // Extend the base vector with one value to accommodate for the batch
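
A note on reading the register/lane base vectors in populateFP4LL above: as I understand Triton's LinearLayout, an input index (register id or lane id) maps to an output coordinate by XOR-combining the base vectors selected by the set bits of the index. The standalone sketch below (illustrative only, not the LinearLayout class itself) applies that rule to the mDim == 16 FP4 bases to print which i8 elements of the tile land in each register of lane 0 and in register 0 of a few lanes.

#include <cstdio>
#include <utility>
#include <vector>

// XOR-combine the (nonK, K) base vectors selected by the set bits of `index`.
// The bases below are the i8-element coordinates used in populateFP4LL for the
// mDim == 16 case.
static std::pair<int, int>
applyBases(const std::vector<std::pair<int, int>> &bases, int index) {
  int nonK = 0, k = 0;
  for (size_t bit = 0; bit < bases.size(); ++bit) {
    if (index & (1 << bit)) {
      nonK ^= bases[bit].first;
      k ^= bases[bit].second;
    }
  }
  return {nonK, k};
}

int main() {
  std::vector<std::pair<int, int>> registerBase = {{1, 0}, {2, 0}, {4, 0}, {0, 16}};
  std::vector<std::pair<int, int>> laneBase = {{0, 1}, {0, 2}, {0, 4},
                                               {0, 8}, {0, 32}, {0, 64}};
  for (int r = 0; r < 16; ++r) {
    std::pair<int, int> c = applyBases(registerBase, r);
    std::printf("lane 0, register %2d -> i8 (%d, %d)\n", r, c.first, c.second);
  }
  for (int l = 0; l < 64; l += 8) {
    std::pair<int, int> c = applyBases(laneBase, l);
    std::printf("register 0, lane %2d -> i8 (%d, %d)\n", l, c.first, c.second);
  }
}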

test/Conversion/amd/ds_transpose.mlir

Lines changed: 27 additions & 0 deletions

@@ -378,4 +378,31 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     ttg.local_store %3, %arg2 : tensor<128x128xf32, #mma32> -> !ttg.memdesc<128x128xf32, #shared1, #smem, mutable>
     tt.return
   }
+
+  // CHECK-LABEL: ds_transpose_t_fp4_mfma32_small
+  tt.func @ds_transpose_t_fp4_mfma32_small(%arg0: !ttg.memdesc<16x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x16xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-4: rocdl.ds.read.tr4.b64 %{{.*}} : <3> -> vector<2xi32>
+    // CHECK-NOT: rocdl.ds.read.tr4.b64
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<16x64xi8, #shared, #smem, mutable> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    %2 = amdgpu.local_load_packed_tranposed %arg1 : !ttg.memdesc<64x16xi8, #shared1, #smem, mutable> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_fp4_mfma16
+  tt.func @ds_transpose_t_fp4_mfma16(%arg0: !ttg.memdesc<8x128xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<128x8xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-4: rocdl.ds.read.tr4.b64 %{{.*}} : <3> -> vector<2xi32>
+    // CHECK-NOT: rocdl.ds.read.tr4.b64
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<8x128xi8, #shared, #smem, mutable> -> tensor<16x64xi8, #ttg.dot_op<{opIdx = 0, parent = #mma16, kWidth = 16}>>
+    %2 = amdgpu.local_load_packed_tranposed %arg1 : !ttg.memdesc<128x8xi8, #shared1, #smem, mutable> -> tensor<64x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma16, kWidth = 16}>>
+    tt.return
+  }
+
+  // CHECK-LABEL: ds_transpose_t_fp4_mfma32
+  tt.func @ds_transpose_t_fp4_mfma32(%arg0: !ttg.memdesc<256x256xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<256x256xi8, #shared1, #smem, mutable>) {
+    // CHECK-COUNT-128: rocdl.ds.read.tr4.b64 %{{.*}} : <3> -> vector<2xi32>
+    // CHECK-NOT: rocdl.ds.read.tr4.b64
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<256x256xi8, #shared, #smem, mutable> -> tensor<512x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    %2 = amdgpu.local_load_packed_tranposed %arg1 : !ttg.memdesc<256x256xi8, #shared1, #smem, mutable> -> tensor<128x512xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
 }
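
A rough cross-check of the CHECK-COUNT values above (my own arithmetic, not part of the PR): each rocdl.ds.read.tr4.b64 returns a vector<2xi32>, i.e. 64 bits or 8 of the i8 elements per lane. For the 256x256 case, assuming the #mma32 layout uses warpsPerCTA = [2, 2] (as in the invalid.mlir test below) and that warps along the K dimension hold replicated data, the expected instruction count works out as follows:

#include <cstdio>

int main() {
  const long i8PerRead = 64 / 8;            // one ds_read_tr4.b64 per lane
  const long lanes = 64;                    // wavefront size
  const long distinctWarpsPerOperand = 2;   // assumption: K-dim warps replicate
  const long elemsPerOperand = 512L * 128;  // 512x128 i8 result of one load
  long elemsPerThread = elemsPerOperand / (lanes * distinctWarpsPerOperand);
  long readsPerOperand = elemsPerThread / i8PerRead;
  // Two local_load_packed_tranposed ops per test function.
  std::printf("reads per operand: %ld, per function: %ld\n", readsPerOperand,
              2 * readsPerOperand); // prints 64 and 128, matching CHECK-COUNT-128
}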

test/TritonGPU/amd/invalid.mlir

Lines changed: 34 additions & 0 deletions

@@ -93,3 +93,37 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     tt.return
   }
 }
+
+// -----
+
+#mma32 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = true}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
+#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @local_load_packed_tranposed_wrong_op_idx(%arg0: !ttg.memdesc<16x64xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<64x16xi8, #shared1, #smem, mutable>) {
+    // expected-error @+1 {{Order of dimensions don't match expected}}
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<16x64xi8, #shared, #smem, mutable> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+
+  tt.func @local_load_packed_tranposed_wrong_op_idx2(%arg0: !ttg.memdesc<64x16xi8, #shared, #smem, mutable>) {
+    // expected-error @+1 {{Input and output dimensions don't match after packing changes}}
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<64x16xi8, #shared, #smem, mutable> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+  tt.func @local_load_packed_tranposed_wrong_attr(%arg1: !ttg.memdesc<128x8xi8, #blocked, #smem, mutable>) {
+    // expected-error @+1 {{only works with SwizzledSharedEncodingAttr src encoding}}
+    %1 = amdgpu.local_load_packed_tranposed %arg1 : !ttg.memdesc<128x8xi8, #blocked, #smem, mutable> -> tensor<64x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
+    tt.return
+  }
+  // CHECK-LABEL: ds_transpose_t_fp4_mfma16
+  tt.func @local_load_packed_tranposed_wrong_shape(%arg0: !ttg.memdesc<8x128xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<128x8xi8, #shared1, #smem, mutable>) {
+    // expected-error @+1 {{only works with DotOperandEncodingAttr dst encoding}}
+    %1 = amdgpu.local_load_packed_tranposed %arg0 : !ttg.memdesc<8x128xi8, #shared, #smem, mutable> -> tensor<256x128xi32, #blocked>
+    tt.return
+  }
+
+}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 28 additions & 0 deletions

@@ -522,4 +522,32 @@ def InThreadTransposeOp : TT_AMDGPU_Op<"in_thread_transpose", [Pure]> {
   }];
 }

+//===----------------------------------------------------------------------===//
+// LocalLoadPackedTransposedOp
+//===----------------------------------------------------------------------===//
+
+def LocalLoadPackedTransposedOp : TT_AMDGPU_Op<"local_load_packed_tranposed"> {
+  let summary = "Load a transposed packed tensor from shared memory into a distributed tensor";
+  let description = [{
+    Requires a M/N packed and M/N contiguous tensor in shared memory and will yield a K packed K contiguous tensor in registers.
+    The packing change will change the shape of the tensor by doubling the M/N dimension and halving the K dimension.
+    For example if A is 16x64 in shared memory, the result of this operation will be 32x32.
+  }];
+  let arguments = (ins
+    Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src,
+    Optional<TTG_AsyncToken>:$token
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let builders = [
+      OpBuilder<(ins "Type":$retType, "Value":$src),
+      [{
+        build($_builder, $_state, retType, src, /*token=*/static_cast<mlir::Value>(nullptr));
+      }]>];
+
+  // Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
+  let assemblyFormat = [{$src (`token` $token^)? attr-dict `:` qualified(type($src)) `->` type($result)}];
+  let hasVerifier = 1;
+}
+
 #endif
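
For orientation, this is roughly how the new builder declared above might be used from C++ during lowering. It is a sketch under assumptions: that the generated class lives in the mlir::triton::amdgpu namespace like the other ops in this file, and that the relevant dialect headers are included; emitLocalLoadPackedTransposed and its arguments are hypothetical names for the example.

#include "mlir/IR/Builders.h"
// Also requires the generated TritonAMDGPU dialect op declarations.

// Sketch: create the op via the two-argument builder from the .td above; the
// optional async token is left unset (null Value).
mlir::Value emitLocalLoadPackedTransposed(mlir::OpBuilder &builder,
                                          mlir::Location loc,
                                          mlir::Type resultType,
                                          mlir::Value sharedMemSrc) {
  auto op = builder.create<mlir::triton::amdgpu::LocalLoadPackedTransposedOp>(
      loc, resultType, sharedMemSrc);
  return op.getResult();
}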
