Commit 6ab8757

Merge commit '5e59bdfed405c8020b77f4af94e1d04740de967f'
2 parents 6b6d9ac + 5e59bdf commit 6ab8757

File tree: 27 files changed, +455 -428 lines

.github/workflows/integration-tests.yml

Lines changed: 5 additions & 12 deletions

@@ -327,7 +327,7 @@ jobs:
        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}}
    name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}})
    container:
-      image: rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4
+      image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
      options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
    steps:
      - name: Checkout
@@ -396,22 +396,15 @@ jobs:

          mkdir -p ~/.ccache
          du -h -d 1 ~/.ccache
-      - name: Update PATH
-        run: |
-          echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
-      - name: Install pip dependencies
-        run: |
-          python3 -m pip install --upgrade pip
-          python3 -m pip install lit
-      - name: Install apt dependencies
+      - name: Update compiler to clang
        run: |
-          apt update
-          apt install ccache
+          export CC=/usr/bin/clang
+          export CXX=/usr/bin/clang++
      - name: Install Triton
        id: amd-install-triton
        run: |
          echo "PATH is '$PATH'"
-          pip uninstall -y triton
+          pip uninstall -y triton pytorch-triton-rocm
          cd python
          ccache --zero-stats
          pip install -v -e '.[tests]'

.github/workflows/integration-tests.yml.in

Lines changed: 5 additions & 14 deletions

@@ -374,7 +374,7 @@ jobs:
    name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}})

    container:
-      image: rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4
+      image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
      options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root

    steps:
@@ -388,25 +388,16 @@ jobs:
      - *restore-build-artifacts-step
      - *inspect-cache-directories-step

-      - name: Update PATH
-        run: |
-          echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
-
-      - name: Install pip dependencies
-        run: |
-          python3 -m pip install --upgrade pip
-          python3 -m pip install lit
-
-      - name: Install apt dependencies
+      - name: Update compiler to clang
        run: |
-          apt update
-          apt install ccache
+          export CC=/usr/bin/clang
+          export CXX=/usr/bin/clang++

      - name: Install Triton
        id: amd-install-triton
        run: |
          echo "PATH is '$PATH'"
-          pip uninstall -y triton
+          pip uninstall -y triton pytorch-triton-rocm
          cd python
          ccache --zero-stats
          pip install -v -e '.[tests]'

CMakeLists.txt

Lines changed: 9 additions & 0 deletions

@@ -84,6 +84,9 @@ if(NOT WIN32)
endif()

if(TRITON_BUILD_UT)
+  # This is an aggregate target for all unit tests.
+  add_custom_target(TritonUnitTests)
+  set_target_properties(TritonUnitTests PROPERTIES FOLDER "Triton/Tests")
  include(AddTritonUnitTest)
endif()

@@ -355,4 +358,10 @@ add_subdirectory(test)

if(TRITON_BUILD_UT)
  add_subdirectory(unittest)
+  # This target runs all the unit tests.
+  add_custom_target(check-triton-unit-tests
+    COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
+    DEPENDS TritonUnitTests
+    USES_TERMINAL
+  )
endif()

cmake/AddTritonUnitTest.cmake

Lines changed: 3 additions & 0 deletions

@@ -36,4 +36,7 @@ function(add_triton_ut)
  # laptop. I think the issue may be that the very first time you run a program
  # it's a bit slow.
  gtest_discover_tests(${__NAME} DISCOVERY_TIMEOUT 60)
+
+  # Add the unit test to the top-level unit test target.
+  add_dependencies(TritonUnitTests ${__NAME})
endfunction()

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 4 additions & 0 deletions

@@ -214,6 +214,10 @@ LinearLayout ensureLayoutNotSmallerThan(
    const LinearLayout &layout,
    const llvm::SmallDenseMap<StringAttr, int64_t> &shape);

+SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank);
+LinearLayout identityStandardND(StringAttr inDimName, ArrayRef<unsigned> shape,
+                                ArrayRef<unsigned> order);
+
// Dump information about which threads/registers contain each of the tensor
// elements.
void dumpLayout(RankedTensorType tensorType);

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ void decomposeBlockedToDotLayoutConversion(ModuleOp module) {
        dyn_cast<triton::gpu::DotOperandEncodingAttr>(dstType.getEncoding());
    if (srcBlocked && dstDotOp) {
      auto dotParent = dyn_cast<NvidiaMmaEncodingAttr>(dstDotOp.getParent());
-      if (dotParent && dotParent.isAmpere()) {
+      if (dotParent) {
        return;
      }
      Attribute sharedMemorySpace =

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 30 additions & 0 deletions

@@ -658,6 +658,36 @@ LinearLayout ensureLayoutNotSmallerThan(
  return ret;
}

+// Returns ["dim0", "dim1", ..., "dim<rank-1>"].
+SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank) {
+  SmallVector<StringAttr> ret;
+  for (int i = 0; i < rank; i++) {
+    ret.push_back(StringAttr::get(ctx, "dim" + llvm::Twine(i)));
+  }
+  return ret;
+}
+
+// Returns a 1D -> ND layout into [dim0, dim1, ...] that's equivalent to
+// creating a 1D -> 1D mapping of size product(shape) and then reshaping to
+// permute(shape, order).
+LinearLayout identityStandardND(StringAttr inDimName, ArrayRef<unsigned> shape,
+                                ArrayRef<unsigned> order) {
+  assert(shape.size() == order.size());
+  MLIRContext *ctx = inDimName.getContext();
+  auto rank = shape.size();
+
+  // The order in triton is written wrt. [dim0, dim1, ...].
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  LinearLayout ret = LinearLayout::empty();
+  for (int i = 0; i < shape.size(); i++) {
+    // Start with the most-minor dimension, which is order[0].
+    int dim = order[i];
+    ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
+  }
+  return ret;
+}
+
} // namespace gpu
} // namespace triton
} // namespace mlir
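Editor's note: the two helpers above are now exported from Dialect.cpp so other passes can build standard layouts directly. The following is a minimal sketch of how they could be exercised; the standalone main(), the MLIR include paths, and the printed output are illustrative assumptions and are not part of this commit.

    // Illustrative only: build identityStandardND over shape [16, 2] with
    // order [1, 0] (dim1 most minor) and check the resulting output-dim sizes.
    #include "mlir/IR/BuiltinAttributes.h"
    #include "mlir/IR/MLIRContext.h"
    #include "triton/Dialect/TritonGPU/IR/Dialect.h"

    #include <cstdio>

    int main() {
      mlir::MLIRContext ctx;
      auto kRegister = mlir::StringAttr::get(&ctx, "register");

      // Equivalent to a 1D identity of size 16 * 2 reshaped to
      // permute({16, 2}, {1, 0}).
      auto layout =
          mlir::triton::gpu::identityStandardND(kRegister, {16, 2}, {1, 0});

      auto dims = mlir::triton::gpu::standardOutDimNames(&ctx, /*rank=*/2);
      std::printf("dim0 = %d, dim1 = %d\n", layout.getOutDimSize(dims[0]),
                  layout.getOutDimSize(dims[1]));
      return 0;
    }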

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 0 additions & 30 deletions

@@ -33,15 +33,6 @@ namespace {

#define S(v) StringAttr::get(ctx, (v))

-// Returns ["dim0", "dim1", ..., "dim<rank-1>"].
-SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank) {
-  SmallVector<StringAttr> ret;
-  for (int i = 0; i < rank; i++) {
-    ret.push_back(S("dim" + llvm::Twine(i)));
-  }
-  return ret;
-}
-
// TODO Have order be a mandatory argument of standardOutDimNames.
SmallVector<StringAttr> permuteDimNames(const SmallVector<StringAttr> &names,
                                        const SmallVector<unsigned> &order) {
@@ -53,27 +44,6 @@ SmallVector<StringAttr> permuteDimNames(const SmallVector<StringAttr> &names,
  return ret;
}

-// Returns a 1D -> ND layout into [dim0, dim1, ...] that's equivalent to
-// creating a 1D -> 1D mapping of size product(shape) and then reshaping to
-// permute(shape, order).
-LinearLayout identityStandardND(StringAttr inDimName, ArrayRef<unsigned> shape,
-                                ArrayRef<unsigned> order) {
-  assert(shape.size() == order.size());
-  MLIRContext *ctx = inDimName.getContext();
-  auto rank = shape.size();
-
-  // The order in triton is written wrt. [dim0, dim1, ...].
-  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
-
-  LinearLayout ret = LinearLayout::empty();
-  for (int i = 0; i < shape.size(); i++) {
-    // Start with the most-minor dimension, which is order[0].
-    int dim = order[i];
-    ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
-  }
-  return ret;
-}
-
// Make a LinearLayout that maps a block-id to an N-dimensional index.
//
// The tensor is split up into CTAsPerCGA pieces, which are distributed among

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 73 additions & 44 deletions

@@ -12,6 +12,7 @@
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "triton/Tools/StrUtil.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

@@ -394,6 +395,10 @@ class DecomposeScaledBlocked
    auto aType = scaledDotOp.getLhsType();
    auto bType = scaledDotOp.getRhsType();

+    auto rank = oldRetType.getShape().size();
+    if (rank != 2)
+      return rewriter.notifyMatchFailure(scaledDotOp, "NYI: rank==3");
+
    assert((aType == ScaleDotElemType::E4M3 ||
            aType == ScaleDotElemType::E5M2 ||
            aType == ScaleDotElemType::E2M1) &&
@@ -430,71 +435,95 @@ class DecomposeScaledBlocked
    // `bases[warps] = {(0, 0), (0, 0), ...}`

    auto newAEncoding = DotOperandEncodingAttr::get(ctx, 0, mmaEnc, aKWidth);
-    auto rank = mmaEnc.getInstrShape().size();
+
    // MMAv3 uses the first dimension for the M dimension, while MMAv2 uses the
    // penultimate (ugh)
-    auto instrShapeM = mmaEnc.getInstrShape()[versionMajor == 3 ? 0 : rank - 2];
+    auto instrShapeM =
+        mmaEnc.getInstrShape()[versionMajor == 3
+                                   ? 0
+                                   : mmaEnc.getInstrShape().size() - 2];
    auto warpSize = getWarpSize(newAEncoding);
    assert(instrShapeM <= warpSize);
    // Necessary choice to leave all the scales of the tile in that given warp
    auto threadsPerWarp =
        SmallVector<unsigned>{instrShapeM, warpSize / instrShapeM};

-    assert(versionMajor == 2 &&
-           "NYI: MMAv3. Need to rethink the scale layout otherwise");
-
-    // Copy the bases
-
+    // This has to align with the order in UpcastMXFPOp
+    auto order = getMatrixOrder(rank, /*rowMajor=*/true);
    Attribute newScaleEncoding = triton::gpu::BlockedEncodingAttr::get(
-        ctx, {1, 1}, threadsPerWarp, newAEncoding.getWarpsPerCTA(),
-        newAEncoding.getCTAOrder(), mmaEnc.getCTALayout());
+        ctx, {1, 1}, threadsPerWarp, newAEncoding.getWarpsPerCTA(), order,
+        mmaEnc.getCTALayout());

+    // Lezcano: In the future we could just use the LLs unconditionally
+    // Not doing it now as they are not as performant as Blocked encoding at
+    // times E.g., we bail on them in the backwardMaterialization pass
    auto dotBroadcastsWarpLevel = mmaEnc.getWarpsPerCTA()[1] != 1;
    if (dotBroadcastsWarpLevel) {
-      // If mma has warpsPerCTA == {2, 2}, then newAEncoding has
-      // warpsPerCTA == {2, 1}. In this case, we need to broadcast the warps
-      // on the second dimension as per
-      // A: 0 1 | 0 1
-      //    - - | - -
-      //    2 3 | 2 3
-      // This broadcasting is not representable by standard blocked encodings,
-      // so we need to use linear layouts.
-      // This broadcasting is implemented in ampereDotToLinearLayout
-      auto blocked = cast<BlockedEncodingAttr>(newScaleEncoding);
-      auto blockedLL = *blocked.toLinearLayout(a.getType().getShape());
-      LinearLayout::BasesT scaleBases = blockedLL.getBases();
-      auto nBases = llvm::Log2_32(mmaEnc.getWarpsPerCTA()[1]);
-      auto &warps = scaleBases[StringAttr::get(ctx, "warp")];
-      // Prepend the vector of zeros to the warpBases
-      warps.insert(warps.begin(), nBases, std::vector<int32_t>(rank, 0));
-      auto outDims = llvm::to_vector(blockedLL.getOutDimNames());
-      auto newLL = LinearLayout(scaleBases, outDims);
-      auto llEncoding = LinearEncodingAttr::get(ctx, std::move(newLL));
-      // Adjust the shape of the layout to match the scale operand
-      auto scaleShape = scale.getType().getShape();
-      newScaleEncoding =
-          LinearEncodingAttr::get(ctx, *llEncoding.toLinearLayout(scaleShape));
+      auto kRegister = StringAttr::get(ctx, "register");
+      auto regs = identityStandardND(kRegister, {1, 1}, order);
+      auto lanes =
+          identityStandardND(StringAttr::get(ctx, "lane"), {16, 2}, order);
+
+      // Extract warp layout from dotAEncoding
+      // In the future we'll have some nice division utils, but until then...
+      auto dotLL = *newAEncoding.toLinearLayout(a.getType().getShape());
+      LinearLayout::BasesT scaleBases = dotLL.getBases();
+      auto kWarp = StringAttr::get(ctx, "warp");
+      auto &warpBases = scaleBases[kWarp];
+      // The tile shape was [16, 2 * 4 * kWidth] with broadcasting in K
+      // We divide the M dimension by 16
+      auto div = 16;
+      for (auto &warpBase : warpBases) {
+        if (warpBase[rank - 2] != 0) {
+          assert(warpBase[rank - 2] % div == 0);
+          warpBase[rank - 2] /= div;
+        }
+      }
+
+      LinearLayout::BasesT warpBlockBases;
+      auto standardOutDims = llvm::to_vector(dotLL.getOutDimNames());
+      warpBlockBases[kWarp] = warpBases;
+      auto kBlock = StringAttr::get(ctx, "block");
+      assert(scaleBases[kBlock].empty() && "NYI: CGAs");
+      warpBlockBases[kBlock] = {};
+      auto warpBlock = LinearLayout(std::move(warpBlockBases), standardOutDims);
+
+      auto newLL =
+          (regs * lanes) *
+          warpBlock.transposeOuts(llvm::to_vector(lanes.getOutDimNames()));
+      auto shape = scale.getType().getShape();
+
+      // Broadcast to the correct shape Equivalent to
+      // newLL = ensureLayoutNotSmallerThan(newLL.transposeOuts(getRepOrder),
+      //                                    shape);
+      for (auto d : newAEncoding.getRepOrder()) {
+        auto outDim = standardOutDims[d];
+        auto dimSize = newLL.getOutDimSize(outDim);
+        newLL *=
+            LinearLayout::identity1D(shape[d] / dimSize, kRegister, outDim);
+      }
+      newLL = newLL.transposeOuts(standardOutDims);
+      newScaleEncoding = LinearEncodingAttr::get(ctx, std::move(newLL));
    }

    a = createArg(rewriter, a, 0, aType, newAEncoding, scale, newScaleEncoding);

-    // Upcast B operand
-    assert(bType != ScaleDotElemType::E2M1 && "NYI: rhs scale for fp4");
-    auto newBEncoding = DotOperandEncodingAttr::get(ctx, 1, mmaEnc, bKWidth);
-    b = createArg(rewriter, b, 1, bType, newBEncoding,
-                  /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt);
    Operation *newDot = nullptr;
    if (versionMajor == 2) {
+      // Upcast B operand
+      assert(bType != ScaleDotElemType::E2M1 && "NYI: rhs scale for fp4");
+      auto newBEncoding = DotOperandEncodingAttr::get(ctx, 1, mmaEnc, bKWidth);
+      b = createArg(rewriter, b, 1, bType, newBEncoding,
+                    /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt);
      newDot = rewriter.create<DotOp>(scaledDotOp.getLoc(), newRetType, a, b,
                                      newAcc);
    } else {
      assert(versionMajor == 3);
      // At the time of this writing, this is always true
      auto allowTranspose = b.getType().getElementType().isBF16();
-      b = cast<TypedValue<RankedTensorType>>(
-          getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose));
+      auto bShmem = getSharedMemoryMMAOperand(b, rewriter, 1, allowTranspose);
      newDot = rewriter.create<triton::nvidia_gpu::WarpGroupDotOp>(
-          scaledDotOp.getLoc(), newRetType, a, b, newAcc, nullptr);
+          scaledDotOp.getLoc(), newRetType, a, bShmem, newAcc, nullptr);
    }

    // convert dot instruction
@@ -578,11 +607,11 @@ class DecomposeScaledBlocked
    auto dotOp = rewriter.create<DotOp>(
        scaledDotOp.getLoc(), scaledDotOp.getType(), a, b, scaledDotOp.getC());

-    // Waiting for https://github.com/triton-lang/triton/pull/5003 to land
-    // cf.
-    // https://github.com/triton-lang/triton/pull/5003#issuecomment-2445091746
-    // int versionMajor = getMMAVersionSafe(computeCapability, dotOp);
    int versionMajor = 2;
+    // We just support bf16 for MMAv3 on the rhs
+    if (bType == ScaleDotElemType::BF16) {
+      versionMajor = getMMAVersionSafe(computeCapability, dotOp);
+    }
    int versionMinor = computeCapability == 75 ? 1 : 0;

    RankedTensorType oldRetType = dotOp.getType();
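Editor's note: the least obvious part of the hunk above is the warp-basis rescaling inside the dotBroadcastsWarpLevel branch, which divides the M component of every nonzero warp basis by the 16-row tile height so the scale operand sees the same warp distribution as the A operand. Below is a self-contained sketch of just that step; the helper name rescaleWarpBasesAlongM and the plain std::vector stand-in for LinearLayout::BasesT entries are assumptions for illustration, not Triton API.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Illustrative stand-in for one warp basis vector (one entry per output dim).
    using Basis = std::vector<int32_t>;

    // Divide the M component (index rank - 2) of every nonzero warp basis by the
    // 16-row tile height, mirroring the loop over `warpBases` in the diff above.
    void rescaleWarpBasesAlongM(std::vector<Basis> &warpBases, int rank,
                                int32_t tileM = 16) {
      for (Basis &b : warpBases) {
        if (b[rank - 2] != 0) {
          assert(b[rank - 2] % tileM == 0);
          b[rank - 2] /= tileM;
        }
      }
    }

    int main() {
      // Two warp bases for a rank-2 layout: one steps 16 rows in M, one steps
      // 32 columns in K. Only the M-stepping basis gets divided.
      std::vector<Basis> warps = {{16, 0}, {0, 32}};
      rescaleWarpBasesAlongM(warps, /*rank=*/2);
      assert(warps[0][0] == 1 && warps[0][1] == 0);
      assert(warps[1][0] == 0 && warps[1][1] == 32);
      return 0;
    }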

python/tutorials/02-fused-softmax.py

Lines changed: 6 additions & 8 deletions

@@ -158,14 +158,12 @@ def allocated_slm_size(size_smem):
    y = torch.empty_like(x)

    # pre-compile kernel to get register usage and compute thread occupancy.
-    kernel, num_programs = kernels.get(BLOCK_SIZE, (None, 0))
-    if kernel is None:
-        kernel = softmax_kernel.warmup(y, x, x.stride(0), y.stride(0), n_rows, n_cols, num_warps=num_warps,
-                                       threads_per_warp=WARP_SIZE, BLOCK_SIZE=BLOCK_SIZE, grid=(1, ))
-        kernel._init_handles()
-        size_smem = kernel.metadata.shared
-        num_programs = occupancy(num_warps, size_smem)
-        kernels[BLOCK_SIZE] = (kernel, num_programs)
+    kernel = softmax_kernel.warmup(y, x, x.stride(0), y.stride(0), n_rows, n_cols, num_warps=num_warps,
+                                   threads_per_warp=WARP_SIZE, BLOCK_SIZE=BLOCK_SIZE, grid=(1, ))
+    kernel._init_handles()
+    size_smem = kernel.metadata.shared
+    num_programs = occupancy(num_warps, size_smem)
+    kernels[BLOCK_SIZE] = (kernel, num_programs)

    # We will *not* launch a persistent kernel if the number of rows is lower (not needed) or that would imply each
    # program would need to process more than 2 rows. Persistent kernels save thread dispatch overhead, but cannot