Commit 2dbc39c

Merge commit '6c3e9535c44774dfd56357acba9c2183b247f58e'
2 parents: 9c52bc3 + 6c3e953

File tree: 44 files changed (+1486 / -406 lines)


.github/workflows/integration-tests.yml

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ env:
   TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
   TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
   TRITON_DISABLE_LINE_INFO: 1
+  PROTON_SKIP_PC_SAMPLING_TEST: 1
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest

.github/workflows/integration-tests.yml.in

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ env:
   TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
   TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
   TRITON_DISABLE_LINE_INFO: 1
-
+  PROTON_SKIP_PC_SAMPLING_TEST: 1

 jobs:
   Runner-Preparation:

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 0 additions & 9 deletions
@@ -57,15 +57,6 @@ class TargetInfoBase {
                              unsigned numLaneToReduce,
                              unsigned interleave) const = 0;

-  // TODO (Keren): Remove this function once layout conversion using stmatrix is
-  // handled by Linear Layout.
-  virtual bool processReplicaUsingStMatrix(
-      RewriterBase &rewriter, Location loc, Value smemBase,
-      SmallVector<Value> &vals, RankedTensorType srcTy, Type elemTy,
-      ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> origRepShape,
-      ArrayRef<unsigned> outOrd, unsigned accumNumReplicates,
-      int swizzleByteWidth = 0) const = 0;
-
   virtual std::string getMulhiFuncName(Type resultElementTy) const = 0;
   // Emits LLVM code with |rewriter| to print a message following the given
   // format from the device. |formatStrStart| is the pointer to the start of

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 125 additions & 3 deletions
@@ -113,12 +113,134 @@ LinearLayout chooseShemLayoutForRegToRegConversion(
 // row0 reg[0-1] reg[4-5]
 // row8 reg[2-3] reg[6-7]
 //
+// When `swizzleByteSize` is non-zero, the layout is constructed
+// differently due to the leading dimension offset and swizzling.
+// There are two key concepts to understand:
+//
+// 1. Chunks: The leading dimension (i.e., the column dimension) is divided
+//    into chunks, where each chunk's size is determined by `swizzleByteSize`.
+// 2. Swizzling within tiles: Each tile applies a swizzling pattern to its
+//    rows to optimize memory access.
+//
+// - Concept 1: Chunks
+//
+// In the swizzled layout, the leading dimension is strided by
+// `swizzleByteSize`. This introduces the concept of a "chunk", where each
+// chunk spans a certain number of columns.
+//
+// For a tile size of `stmatrix.x4` (16x16 elements), with each element being
+// 16 bits (2 bytes), each tile occupies 16 rows and 32 bytes per row (since 16
+// elements * 2 bytes per element = 32 bytes per row).
+//
+// Given a `swizzleByteSize` of 128 bytes, the number of tiles per chunk can be
+// calculated as:
+//
+//   Number of tiles per chunk = swizzleByteSize / (bytes per row)
+//                             = 128 bytes / 32 bytes = 4 tiles
+//
+// Therefore, each chunk contains 4 tiles horizontally, spanning 64 columns
+// (since each tile is 16 columns wide):
+//
+//           col0-15  col16-31  col32-47  col48-63
+//   row0-15 tile0    tile1     tile2     tile3
+//
+// For a tensor of size 128x128 elements (#rows x #columns), with each element
+// being 16 bits, the tensor can be divided into multiple chunks both
+// horizontally and vertically. Chunks are stored in memory in chunk-level
+// "column-major" order, meaning chunk1's address follows chunk0's.
+//
+// Assuming we have 8 warps, we assign each warp to process a chunk of 16
+// rows (the rows of one tile) and 128 columns (the width of two chunks), so
+// each warp handles one horizontal slice of the tensor.
+//
+// The overall layout can be visualized as:
+//
+//                       |<- 128 * 128 bytes ->|<- 128 * 128 bytes ->|
+//                           columns 0-63          columns 64-127
+//   warp0 | rows 0-15       chunk0                chunk8
+//   warp1 | rows 16-31      chunk1                chunk9
+//   warp2 | rows 32-47      chunk2                chunk10
+//   warp3 | rows 48-63      chunk3                chunk11
+//   warp4 | rows 64-79      chunk4                chunk12
+//   warp5 | rows 80-95      chunk5                chunk13
+//   warp6 | rows 96-111     chunk6                chunk14
+//   warp7 | rows 112-127    chunk7                chunk15
+//
+// - Concept 2: Swizzling within tiles
+//
+// Within each 16x16 tile, rows are swizzled to optimize memory access
+// patterns. This swizzling is similar to what's defined in
+// `TritonGPUAttrDefs.td`, applied at the level of each 16x16 tile rather
+// than the entire tensor.
+//
+// Key parameters for swizzling:
+//
+// - `perPhase`: The number of rows over which to apply an XOR operation at
+//   each phase.
+// - `maxPhase`: The total number of phases.
+// - `vectorWidth`: The number of elements per vector, which is 8 in this case
+//   because `stmatrix` stores 8 contiguous elements per thread.
+//
+// The offset of each element within a tile is calculated using the formula:
+//
+//   offset = row * swizzleByteSize +
+//            (vectorWidth * ((row / perPhase) % maxPhase)) * elementSize
+//
+// where `elementSize` is the size of each element in bytes (2 bytes for 16-bit
+// elements).
+//
+// For example, consider the element at index `(row=1, col=0)` in chunk0:
+//
+// Without swizzling:
+//
+//   offset = row * swizzleByteSize + col * elementSize
+//          = 1 * 128 bytes + 0 * 2 bytes
+//          = 128 bytes
+//
+// With swizzling (assuming `perPhase=1`, `maxPhase=8`, `vectorWidth=8`):
+//
+//   offset = row * swizzleByteSize +
+//            (vectorWidth * ((row / perPhase) % maxPhase)) * elementSize
+//          = 1 * 128 bytes + (8 * ((1 / 1) % 8)) * 2 bytes
+//          = 128 bytes + (8 * 1) * 2 bytes
+//          = 128 bytes + 16 bytes
+//          = 144 bytes
+//
+// This swizzling ensures that elements are stored in a way that optimizes for
+// memory bandwidth and reduces bank conflicts.
+//
+// - Verification through Linear Layout
+//
+// We can verify the offsets with the following outputs of the corresponding
+// linear layout, where each element is 16 bits (2 bytes):
+//
+//   - register=1  -> offset=1
+//     register=2  -> offset=2
+//     register=4  -> offset=4
+//     register=8  -> offset=16
+//     register=16 -> offset=32
+//     register=32 -> offset=8192
+//   - lane=1  -> offset=72
+//     lane=2  -> offset=144
+//     lane=4  -> offset=288
+//     lane=8  -> offset=512
+//     lane=16 -> offset=8
+//   - warp=1 -> offset=1024
+//     warp=2 -> offset=2048
+//     warp=4 -> offset=4096
+//
+// For index `(row=1, col=0)`, which corresponds to `reg=0` and `lane=1` in
+// `warp=0`, the offset is calculated as 72 * 2 bytes = 144 bytes. The result
+// matches our earlier calculation.
+//
 // TODO(Keren): We should replace tensorTy with a LinearLayout and the element
 // bit width of the tensor in the future to support more flexible tensor
 // encodings
-std::optional<LinearLayout> chooseStMatrixLayoutForRegToRegConversion(
-    MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
-    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order);
+std::optional<LinearLayout>
+chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
+                     ArrayRef<unsigned> repShape,
+                     ArrayRef<unsigned> paddedRepShape,
+                     ArrayRef<unsigned> order, int swizzleByteSize);
 } // namespace mlir::triton::gpu

 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
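To make the worked example above easy to check mechanically, here is a small Python sketch (not part of the commit; the function name is ours) that replays the comment's offset formula with its stated parameters:

```python
# Replays the swizzled-offset formula from the comment; parameter values and
# expected results come from the comment itself.
def swizzled_offset(row, swizzle_byte_size=128, per_phase=1, max_phase=8,
                    vector_width=8, element_size=2):
    """Byte offset of element (row, col=0) within a chunk."""
    return (row * swizzle_byte_size +
            (vector_width * ((row // per_phase) % max_phase)) * element_size)

# (row=1, col=0): 128 bytes of row stride + 16 bytes of swizzle = 144 bytes,
# matching the linear-layout output lane=1 -> offset=72 (72 * 2 bytes = 144).
assert swizzled_offset(1) == 144
```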

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 6 additions & 12 deletions
@@ -215,15 +215,9 @@ struct ConvertLayoutOpConversion
       if (repId != 0) {
         barrier();
       }
-      auto successful = targetInfo.processReplicaUsingStMatrix(
-          rewriter, loc, smemBase, vals, srcTy,
-          getTypeConverter()->convertType(srcTy.getElementType()),
-          paddedRepShape, origRepShape, outOrd, accumNumReplicates);
-      if (!successful) {
-        processReplica(loc, rewriter, /*stNotRd*/ true, srcTy, inNumCTAsEachRep,
-                       multiDimRepId, inVec, paddedRepShape, origRepShape,
-                       outOrd, vals, smemBase);
-      }
+      processReplica(loc, rewriter, /*stNotRd*/ true, srcTy, inNumCTAsEachRep,
+                     multiDimRepId, inVec, paddedRepShape, origRepShape, outOrd,
+                     vals, smemBase);
       barrier();
       processReplica(loc, rewriter, /*stNotRd*/ false, dstTy, outNumCTAsEachRep,
                      multiDimRepId, outVec, paddedRepShape, origRepShape,
@@ -483,9 +477,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     // Input dims: [reg, lane, warp]
     // Output dims: [offset, iteration]
     std::optional<LinearLayout> shmemStoreLayout =
-        chooseStMatrixLayoutForRegToRegConversion(
-            ctx, op.getSrc().getType(), scratchConfig.repShape,
-            scratchConfig.paddedRepShape, scratchConfig.order);
+        chooseStMatrixLayout(ctx, op.getSrc().getType(), scratchConfig.repShape,
+                             scratchConfig.paddedRepShape, scratchConfig.order,
+                             /*swizzleByteSize=*/0);
     bool isStMatrix = shmemStoreLayout.has_value();
     if (!isStMatrix) {
       shmemStoreLayout = srcLayout.invertAndCompose(sharedLayout);

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 0 additions & 1 deletion
@@ -116,7 +116,6 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
-    // TODO: do we need to check if src is shared ?
     if (isa<SharedEncodingAttr>(srcLayout) &&
         isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
             dstLayout)) {

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 96 additions & 7 deletions
@@ -820,8 +820,8 @@ namespace {
 // stmatrix. These restrictions are retained from legacy code, and we could
 // relax some of them in the future.
 bool canUseStMatrix(RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
-                    ArrayRef<unsigned> paddedRepShape,
-                    ArrayRef<unsigned> order) {
+                    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order,
+                    int swizzleByteSize) {
   auto mmaLayout =
       mlir::dyn_cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
   if (!mmaLayout || !mmaLayout.isHopper())
@@ -840,17 +840,87 @@ bool canUseStMatrix(RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
     return false;
   if (paddedRepShape[1] % 8 != 0)
     return false;
+  if (swizzleByteSize != 0 && swizzleByteSize != 32 && swizzleByteSize != 64 &&
+      swizzleByteSize != 128)
+    return false;
   return true;
 }

-} // anonymous namespace
+std::optional<LinearLayout> chooseStMatrixLayoutLeadingOffset(
+    MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
+    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order,
+    int swizzleByteSize) {
+  StringAttr kReg = S("register");
+  StringAttr kLane = S("lane");
+  StringAttr kWarp = S("warp");
+  StringAttr kCol = S("dim1");
+  StringAttr kRow = S("dim0");
+  StringAttr kOffset = S("offset");
+
+  int perPhase;
+  int maxPhase;
+  if (swizzleByteSize == 32) {
+    perPhase = 4;
+    maxPhase = 2;
+  } else if (swizzleByteSize == 64) {
+    perPhase = 2;
+    maxPhase = 4;
+  } else if (swizzleByteSize == 128) {
+    perPhase = 1;
+    maxPhase = 8;
+  } else {
+    llvm::errs() << "Illegal swizzleByteSize: " << swizzleByteSize << "\n";
+    llvm::report_fatal_error("Illegal swizzleByteSize");
+  }
+
+  // stmatrix only supports 16-bit elements, and each vector has 8 elements
+  int elemBitWidth = 16;
+  int vecSize = 8;
+  int numRows = 16;
+  int numCols = 8 * swizzleByteSize / elemBitWidth;
+
+  // Construct a single stmatrix.x4 (16x16) tile
+  std::vector<std::vector<int>> basesReg = {{1, 0}, {2, 0}, {4, 0}};
+  std::vector<std::vector<int>> basesLane;
+  for (int logRow = 0; logRow < llvm::Log2_32(numRows); logRow++) {
+    int row = 1 << logRow;
+    basesLane.push_back({vecSize * ((row / perPhase) % maxPhase), row});
+  }
+  basesLane.push_back({8, 0});
+
+  // Expand the tile's register dimension to fit swizzleByteSize, which is a
+  // "chunk"
+  for (int logChunk = 0; logChunk < llvm::Log2_32(numCols / 16); logChunk++) {
+    int chunk = 1 << logChunk;
+    basesReg.push_back({16 * chunk, 0});
+  }
+
+  // Construct the layout for a single chunk
+  LinearLayout layout =
+      LinearLayout({{kReg, basesReg}, {kLane, basesLane}}, {kCol, kRow});

-std::optional<LinearLayout> chooseStMatrixLayoutForRegToRegConversion(
+  // Expand the `warp` dimension according to warpsPerCTA.
+  auto mma = cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
+  layout *=
+      identityND(kWarp, mma.getWarpsPerCTA(), /*order=*/{0, 1}, {kRow, kCol})
+          .transposeOuts(llvm::to_vector(layout.getOutDimNames()));
+
+  // Expand the `register` dimension so the size of columns matches `n`.
+  int n = mma.getInstrShape()[1];
+  int numWarpRows = layout.getOutDimSize(kRow);
+  layout = (layout.reshapeOuts({{kOffset, layout.getTotalOutDimSize()}}) *
+            LinearLayout::identity1D(n / numCols, kReg, kOffset))
+               .reshapeOuts({{kCol, n}, {kRow, numWarpRows}});
+
+  auto ret =
+      combineCtaCgaWithShape(layout, mma.getCTALayout(), tensorTy.getShape());
+  return ret.transposeOuts(llvm::to_vector(layout.getOutDimNames()))
+      .reshapeOuts({{kOffset, ret.getTotalOutDimSize()}, {S("iteration"), 1}});
+}
+
+std::optional<LinearLayout> chooseStMatrixLayoutNoLeadingOffset(
     MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
     ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order) {
-  if (!canUseStMatrix(tensorTy, repShape, paddedRepShape, order))
-    return std::nullopt;
-
   StringAttr kReg = S("register");
   StringAttr kLane = S("lane");
   StringAttr kWarp = S("warp");
@@ -880,4 +950,23 @@ std::optional<LinearLayout> chooseStMatrixLayoutForRegToRegConversion(
       {{S("offset"), ret.getTotalOutDimSize()}, {S("iteration"), 1}});
 }

+} // anonymous namespace
+
+std::optional<LinearLayout>
+chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
+                     ArrayRef<unsigned> repShape,
+                     ArrayRef<unsigned> paddedRepShape,
+                     ArrayRef<unsigned> order, int swizzleByteSize) {
+  if (!canUseStMatrix(tensorTy, repShape, paddedRepShape, order,
+                      swizzleByteSize))
+    return std::nullopt;
+
+  if (swizzleByteSize == 0)
+    return chooseStMatrixLayoutNoLeadingOffset(ctx, tensorTy, repShape,
+                                               paddedRepShape, order);
+  else
+    return chooseStMatrixLayoutLeadingOffset(
+        ctx, tensorTy, repShape, paddedRepShape, order, swizzleByteSize);
+}
+
 } // namespace mlir::triton::gpu
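As a cross-check of the swizzle parameters above, the following standalone Python sketch (ours, not from the commit) mirrors how `chooseStMatrixLayoutLeadingOffset` derives its lane bases for each `swizzleByteSize`:

```python
from math import log2

# perPhase/maxPhase pairs as selected in the C++ above; the rest mirrors the
# basesLane loop. The function name and structure are ours.
def lane_bases(swizzle_byte_size):
    per_phase, max_phase = {32: (4, 2), 64: (2, 4), 128: (1, 8)}[swizzle_byte_size]
    vec_size, num_rows = 8, 16
    bases = []
    for log_row in range(int(log2(num_rows))):
        row = 1 << log_row
        # Each row's column offset is shifted by its swizzle phase.
        bases.append([vec_size * ((row // per_phase) % max_phase), row])
    bases.append([8, 0])  # the second 8-column half of the 16x16 tile
    return bases

print(lane_bases(128))  # [[8, 1], [16, 2], [32, 4], [0, 8], [8, 0]]
```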

python/setup.py

Lines changed: 9 additions & 1 deletion
@@ -638,6 +638,14 @@ def get_entry_points():
     return entry_points


+def get_git_commit_hash(length=8):
+    try:
+        cmd = ['git', 'rev-parse', f'--short={length}', 'HEAD']
+        return "+git{}".format(subprocess.check_output(cmd).strip().decode('utf-8'))
+    except Exception:
+        return ""
+
+
 def get_install_requires():
     install_requires = [
         "packaging",  # used by third_party/intel/backend/compiler.py
@@ -647,7 +655,7 @@ def get_install_requires():

 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
-    version="3.0.0" + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
+    version="3.0.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
     author="Philippe Tillet",
     author_email="phil@openai.com",
     description="A language and compiler for custom Deep Learning operations",
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline | FileCheck %s
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc("/data/users/dberard/triton-env/scripts/matmul.py":6:0)
+#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = false}>
+module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: tt.func @use_dep_args
+  tt.func @use_dep_args(%a_ptrs: tensor<64x32x!tt.ptr<bf16>, #blocked>, %b_ptrs: tensor<32x64x!tt.ptr<bf16>, #blocked1>, %loop_range: i32) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>) {
+    %cst = arith.constant dense<32> : tensor<64x32xi32, #blocked>
+    %cst2 = arith.constant dense<2048> : tensor<32x64xi32, #blocked1>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    %c0_i32 = arith.constant 0 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c32_i32 = arith.constant 32 : i32
+    // CHECK: tt.load
+    // CHECK: [[FOR_OUT:%[a-z0-9_]+]]:{{[0-9]+}} = scf.for
+    %for:3 = scf.for %arg6 = %c0_i32 to %loop_range step %c32_i32 iter_args(%arg7 = %cst_0, %arg8 = %a_ptrs, %arg9 = %b_ptrs) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>) : i32 {
+      %63 = tt.load %arg8 : tensor<64x32x!tt.ptr<bf16>, #blocked>
+      %64 = tt.load %arg9 : tensor<32x64x!tt.ptr<bf16>, #blocked1>
+      %65 = triton_gpu.convert_layout %63 : tensor<64x32xbf16, #blocked> -> tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+      %66 = triton_gpu.convert_layout %64 : tensor<32x64xbf16, #blocked1> -> tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
+      %67 = tt.dot %65, %66, %arg7 : tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<64x64xf32, #mma>
+      %68 = tt.addptr %arg8, %cst : tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<64x32xi32, #blocked>
+      %69 = tt.addptr %arg9, %cst2 : tensor<32x64x!tt.ptr<bf16>, #blocked1>, tensor<32x64xi32, #blocked1>
+      scf.yield %67, %68, %69 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>
+    }
+    // CHECK: tt.return {{[^,]+}}, [[FOR_OUT]]#3, [[FOR_OUT]]#4
+    tt.return %for#0, %for#1, %for#2 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>
+  }
+}
