intel
diff --git a/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 10 additions & 11 deletions b/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 10 additions & 11 deletions
diff --git a/‎Makefile‎
Lines changed: 2 additions & 1 deletion b/‎Makefile‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/IR/Dialect.h‎
Lines changed: 2 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/IR/Dialect.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Partition.h‎
Lines changed: 1 addition & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Partition.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h‎
Lines changed: 2 additions & 2 deletions b/‎include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td‎
Lines changed: 4 additions & 1 deletion b/‎include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 35 additions & 5 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 35 additions & 5 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp‎
Lines changed: 31 additions & 103 deletions b/‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp‎
Lines changed: 31 additions & 103 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Types.cpp‎
Lines changed: 8 additions & 0 deletions b/‎lib/Dialect/TritonGPU/IR/Types.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp‎
Lines changed: 2 additions & 2 deletions b/‎lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp‎
Lines changed: 2 additions & 2 deletions
@@ -18,22 +18,23 @@ jobs:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["self-hosted", "gfx90a"]
             # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
             # container expect it at /github/home/.triton. So map here to make sure visible in docker.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx942"]
             # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx950"]
+            # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings
@@ -83,14 +84,16 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
+      - name: Install dependencies
+        # Refresh the package index before installing: container images often
+        # ship with pruned or stale apt lists, in which case a bare
+        # `apt-get install` cannot locate (or fetches 404s for) the packages.
+        run: |
+          apt-get update
+          apt-get install -y clang lld ccache
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
           du -h -d 1 ~/.triton
 
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
-      - name: Update compiler to clang
+      - name: Update compiler to Clang
         run: |
           export CC=/usr/bin/clang
           export CXX=/usr/bin/clang++
@@ -100,19 +103,15 @@ jobs:
           echo "PATH is '$PATH'"
           pip uninstall -y triton pytorch-triton-rocm
 
-          if [ "${{ matrix.runner[0] }}" != "amd-gfx950" ]; then
-            ccache --zero-stats
-          fi
-
+          ccache --zero-stats
           make dev-install
-      - name: CCache Stats
-        if: ${{ matrix.runner[0] != 'amd-gfx950' }}
+      - name: Print ccache stats
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
       - name: Run C++ unittests
         run: make test-cpp
-      - name: Run python tests on AMD
+      - name: Run Python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
 
@@ -73,8 +73,9 @@ test-interpret: all
 
 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py -k "not test_overhead"
 	$(PYTEST) --tb=short -s third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s third_party/proton/test/test_instrumentation.py::test_overhead
 
 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton
 
@@ -303,6 +303,8 @@ SetVector<int> getPartitionIds(Operation *op);
 SmallVector<SetVector<int>, 4> getPartitionOutputs(Operation *op);
 SetVector<int> getPartitionIds(OpOperand *use);
 bool hasPartition(Operation *op);
+bool hasWarpSpecializeTag(Operation *op);
+std::optional<int> getWarpSpecializeTag(Operation *op);
 
 } // namespace mlir::triton::gpu
 
 
@@ -120,6 +120,7 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions);
 void setPartition(Operation *op, const SetVector<int> &partitionIds);
 void setPartitionOutputs(Operation *op,
                          ArrayRef<SetVector<int>> partitionOutputsIds);
+void setWarpSpecializeTag(Operation *op, int tag);
 
 } // namespace mlir::triton::gpu
 
 
@@ -107,8 +107,8 @@ inline const char *getOpShape(TMemAccessAtom atom) {
   llvm_unreachable("Unknown TMemAccessAtom");
 }
 
-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
-                           bool unpacked);
+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom, bool unpacked,
+                           bool withWarp);
 
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);
 
 
@@ -40,14 +40,17 @@ def TTG_TensorMemoryEncodingAttr : AttrDef<TritonNvidiaGPU_Dialect, "TensorMemor
     that is, the stride between two elements in the same row.
     When colStride is 1 the tensor memory is packed. When colStride > 1, the
     tensor memory between elements is undefined.
+    `twoCTAs` indicates that the tensor memory is laid out for twoCTA mode,
+    i.e., `cta_group::2`.
   }];
   let parameters = (
     ins
     "unsigned":$blockM,
     "unsigned":$blockN,
     "unsigned":$colStride,
     DefaultValuedParameter<"unsigned", "1">:$CTASplitM,
-    DefaultValuedParameter<"unsigned", "1">:$CTASplitN
+    DefaultValuedParameter<"unsigned", "1">:$CTASplitN,
+    DefaultValuedParameter<"bool", "false">:$twoCTAs
   );
   let genVerifyDecl = 1;
   let assemblyFormat = "`<` struct(params) `>`";
 
@@ -2806,15 +2806,34 @@ struct TritonGPUInferLayoutInterface
         mlir::dyn_cast<triton::gpu::DotOperandEncodingAttr>(operandEncodingB);
     if (!aEncoding && !bEncoding)
       return mlir::success();
-    auto mmaAEncoding =
-        mlir::dyn_cast_or_null<NvidiaMmaEncodingAttr>(aEncoding.getParent());
-    if (mmaAEncoding && mmaAEncoding.isHopper())
-      return success();
-    // Verify that the encodings are valid.
     if (!aEncoding || !bEncoding)
       return op->emitError("mismatching encoding between A and B operands");
+    // Verify that the encodings are valid.
     if (aEncoding.getKWidth() != bEncoding.getKWidth())
       return op->emitError("mismatching kWidth between A and B operands");
+
+    // Check if we have already selected an MMA version for Nvidia. If so,
+    // validate that the encodings are correct and compatible.
+    auto mmaAEncoding =
+        dyn_cast_or_null<NvidiaMmaEncodingAttr>(aEncoding.getParent());
+    auto mmaBEncoding =
+        dyn_cast_or_null<NvidiaMmaEncodingAttr>(bEncoding.getParent());
+    auto dotOp = cast<DotOp>(op);
+    auto resEnc = dotOp.getResult().getType().getEncoding();
+    auto mmaResEncoding = dyn_cast<NvidiaMmaEncodingAttr>(resEnc);
+    if (mmaAEncoding || mmaBEncoding || mmaResEncoding) {
+      // Check that they are all set and have the same version.
+      if (!mmaAEncoding || !mmaBEncoding || !mmaResEncoding)
+        return op->emitError("mismatching MMA encoding");
+      auto mmaBEncoding = cast<NvidiaMmaEncodingAttr>(bEncoding.getParent());
+      if (mmaAEncoding.getVersionMajor() != mmaBEncoding.getVersionMajor() ||
+          mmaAEncoding.getVersionMajor() != mmaResEncoding.getVersionMajor()) {
+        return op->emitError("mismatched MMA version.");
+      }
+      // Verify that the operands are supported on the selected MMA version.
+      if (!supportMMA(dotOp, mmaResEncoding.getVersionMajor()))
+        return op->emitError("unsupported MMA version");
+    }
     return success();
   }
 
@@ -4032,3 +4051,14 @@ SetVector<int> triton::gpu::getPartitionIds(OpOperand *use) {
 bool triton::gpu::hasPartition(Operation *op) {
   return op && op->hasAttr(kPartitionAttrName);
 }
+
+// Returns true when `op` is non-null and carries the attribute named by
+// kWarpSpecializeTagAttrName; a null `op` yields false.
+bool triton::gpu::hasWarpSpecializeTag(Operation *op) {
+  if (!op)
+    return false;
+  return op->hasAttr(kWarpSpecializeTagAttrName);
+}
+
+// Returns the integer stored in the kWarpSpecializeTagAttrName attribute of
+// `op`, or std::nullopt when `op` is null or does not carry that attribute.
+// The attribute is converted with `cast` (not `dyn_cast`), so it is expected
+// to be an IntegerAttr whenever it is present.
+std::optional<int> triton::gpu::getWarpSpecializeTag(Operation *op) {
+  if (!hasWarpSpecializeTag(op))
+    return std::nullopt;
+  Attribute tagAttr = op->getAttr(kWarpSpecializeTagAttrName);
+  return cast<IntegerAttr>(tagAttr).getInt();
+}
@@ -470,93 +470,6 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(tileLayout, getCTALayout(), shape);
 }
 
-std::optional<LinearLayout>
-chooseLLDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
-                       int32_t elemBitWidth, unsigned instBitWidth,
-                       unsigned numLanesInShuffleGroup) {
-  using BaseTy = std::vector<std::vector<int32_t>>;
-  // This function will derive the layout for the ds_read_tr instruction
-  // based on the input layout (LL/DotLayout/...)
-  // The ds_read_tr instruction works on instBitWidth per lane and in groups of
-  // numLanesInShuffleGroup lanes.
-
-  // In this example we look at ds_read_b64_tr (instBitWidth = 64) and
-  // numLanesInShuffleGroup = 16 with 64 lanes per warp. Using M-continuous
-  // 16-bit input tensor A as an example. Each lane will load 4 consecutive
-  // elements (64-bit in total) along M. There are 4 consecutive lanes in total
-  // along M. Then the loaded elements are exchanged within the MxK=16x4 "base
-  // unit".
-  //        K0  K1  K2  K3
-  //      +---+---+---+---+
-  //  M0  |   |   |   |   |       M0, K[0-3]:  T0
-  //  M1  | T | T | T | T |       M1, K[0-3]:  T1
-  //  M2  | 0 | 4 | 8 |12 |       M2, K[0-3]:  T2
-  //  M3  |   |   |   |   |       M3, K[0-3]:  T3
-  //      +---+---+---+---+
-  //  M4  |   |   |   |   |       M4, K[0-3]:  T4
-  //  M5  | T | T | T | T |       M5, K[0-3]:  T5
-  //  M6  | 1 | 5 | 9 |13 |       M6, K[0-3]:  T6
-  //  M7  |   |   |   |   |       M7, K[0-3]:  T7
-  //      +---+---+---+---+  ==>
-  //  M8  |   |   |   |   |       M8, K[0-3]:  T8
-  //  M9  | T | T | T | T |       M9, K[0-3]:  T9
-  // M10  | 2 | 6 |10 |14 |      M10, K[0-3]: T10
-  // M11  |   |   |   |   |      M11, K[0-3]: T11
-  //      +---+---+---+---+
-  // M12  |   |   |   |   |      M12, K[0-3]: T12
-  // M13  | T | T | T | T |      M13, K[0-3]: T13
-  // M14  | 3 | 7 |11 |15 |      M14, K[0-3]: T14
-  // M15  |   |   |   |   |      M15, K[0-3]: T15
-  //      +---+---+---+---+
-
-  // Given the layout represented by `enc` and shape, we can derive the layout
-  // that ds_read_b64_tr need to have in order to perform a vectorized load of
-  // the elements. This can be done by rearranging the inner 4x16 element base
-  // unit in the LL by rearranging the first numReg register bases and the
-  // first numLane lane bases.
-  auto rotatePrefixes = [](BaseTy &regBase, std::size_t numReg,
-                           BaseTy &laneBase, std::size_t numLane) {
-    // Concatenate prefixes of the two vectors. Lane first and then regs.
-    // C D E F | A B
-    // Then copy over numReg to the regBase and numLane to laneBase
-    // C D | E F A B
-    BaseTy baseUnit(laneBase.begin(), laneBase.begin() + numLane);
-    llvm::append_range(
-        baseUnit, llvm::make_range(regBase.begin(), regBase.begin() + numReg));
-
-    std::copy(baseUnit.begin(), baseUnit.begin() + numReg, regBase.begin());
-    std::copy(baseUnit.begin() + numReg, baseUnit.end(), laneBase.begin());
-  };
-
-  auto ctx = enc.getContext();
-  assert(elemBitWidth == 8 || elemBitWidth == 16);
-  // Get how many reg bases and tile bases the ds_read_tr tile spans
-  unsigned numRegBases = llvm::Log2_32(instBitWidth / elemBitWidth);
-  unsigned numLaneBases = llvm::Log2_32(numLanesInShuffleGroup);
-
-  auto ldsTransLayout = triton::gpu::toLinearLayout(shape, enc);
-  auto bases = ldsTransLayout.getBases();
-  auto kRegister = S("register");
-  auto kLane = S("lane");
-
-  // Make sure that we have enough register bases to rotate, otherwise we
-  // can't return a valid ds_read_tr layout
-  if (ldsTransLayout.getInDimSizeLog2(kRegister) < numRegBases) {
-    return std::nullopt;
-  }
-  // We should always have enough lanes
-  assert(ldsTransLayout.getInDimSizeLog2(kLane) >= numLaneBases);
-  rotatePrefixes(bases[kRegister], numRegBases, bases[kLane], numLaneBases);
-  // Scale types double the elements for a total of 16 vgpr (still only 16
-  // elements contiguous). Need to adjust the lane basis to reflect that
-  if (elemBitWidth == 8 && numLanesInShuffleGroup == 8) {
-    assert(ldsTransLayout.getInDimSizeLog2(kLane) >= (numLaneBases + 1));
-    std::swap(bases[kLane][numLaneBases - 1], bases[kLane][numLaneBases]);
-  }
-
-  return LinearLayout(bases, ldsTransLayout.getOutDims(), false);
-}
-
 std::optional<LinearLayout>
 chooseDotDsReadTrLayout(DotOperandEncodingAttr dotMfmaLayout,
                         ArrayRef<int64_t> shape, int32_t elemBitWidth,
@@ -1192,20 +1105,39 @@ LinearLayout tensorMemoryToLinearLayout(ArrayRef<int64_t> shape,
         LinearLayout::identity1D(encoding.getCTASplitN(), kCol, dims[1]);
     auto newEncoding = TensorMemoryEncodingAttr::get(
         ctx, encoding.getBlockM(), encoding.getBlockN(),
-        encoding.getColStride(), encoding.getCTASplitM(), 1);
+        encoding.getColStride(), encoding.getCTASplitM(), 1,
+        encoding.getTwoCTAs());
     return tensorMemoryToLinearLayout(
                {shape[0], shape[1] / encoding.getCTASplitN()}, newEncoding) *
            split;
   }
   if (encoding.getCTASplitM() > 1) {
-    auto split =
-        LinearLayout::identity1D(encoding.getCTASplitM(), kCol, dims[0]);
+    auto splitM = encoding.getCTASplitM();
+    auto blockM = encoding.getBlockM();
+    bool isM64TwoCTA = blockM == 64 && encoding.getTwoCTAs();
+    if (isM64TwoCTA) {
+      // blockM == 64 and twoCTAs is laid out as the transpose of 128xblockN
+      // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-b
+      blockM *= 2;
+      splitM /= 2;
+    }
+    auto split = LinearLayout::identity1D(splitM, kCol, dims[0]);
     auto newEncoding = TensorMemoryEncodingAttr::get(
-        ctx, encoding.getBlockM(), encoding.getBlockN(),
-        encoding.getColStride(), 1, encoding.getCTASplitN());
-    return tensorMemoryToLinearLayout(
-               {shape[0] / encoding.getCTASplitM(), shape[1]}, newEncoding) *
-           split;
+        ctx, blockM, encoding.getBlockN(), encoding.getColStride(), 1,
+        encoding.getCTASplitN(), encoding.getTwoCTAs());
+    auto ret =
+        tensorMemoryToLinearLayout({shape[0] / splitM, shape[1]}, newEncoding) *
+        split;
+    // In this case, we swap the basis of the last row and last column as per
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-bny
+    if (isM64TwoCTA) {
+      auto bases = ret.getBases();
+      auto &rowBases = bases[kRow];
+      auto &colBases = bases[kCol];
+      std::swap(rowBases[rowBases.size() - 1], colBases[colBases.size() - 1]);
+      ret = LinearLayout(bases, ret.getOutDims(), ret.isSurjective());
+    }
+    return ret;
   }
   assert(encoding.getCTASplitM() == 1 && encoding.getCTASplitN() == 1);
 
@@ -1461,14 +1393,10 @@ std::optional<LinearLayout>
 chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup) {
-  if (elemBitWidth == 4) {
-    auto dot = cast<DotOperandEncodingAttr>(enc);
-    return chooseDotDsReadTrLayout(dot, shape, elemBitWidth, instBitWidth,
-                                   numLanesInShuffleGroup);
-  } else {
-    return chooseLLDsReadTrLayout(enc, shape, elemBitWidth, instBitWidth,
-                                  numLanesInShuffleGroup);
-  }
+  assert(elemBitWidth == 4);
+  auto dot = cast<DotOperandEncodingAttr>(enc);
+  return chooseDotDsReadTrLayout(dot, shape, elemBitWidth, instBitWidth,
+                                 numLanesInShuffleGroup);
 }
 
 LinearLayout chooseScaledWmmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
 
@@ -143,6 +143,14 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
                          << ll.getOutDimSize(dims[0]) << "x"
                          << ll.getOutDimSize(dims[1]);
     }
+    // Note the following holds for both M=64 and M=128 with 2CTA
+    auto nCol = ll.getInDimSize(StringAttr::get(ctx, "col"));
+    if (nCol / (enc.getCTASplitM() * enc.getCTASplitN()) >
+        512 * 32 / bitwidth) {
+      return emitError() << "nCol / (CTASplitM * CTASplitN) must be less than "
+                            "or equal to 512 * 32 / bitwidth but got "
+                         << nCol / (enc.getCTASplitM() * enc.getCTASplitN());
+    }
   } else if (auto enc = dyn_cast<SharedEncodingTrait>(encoding)) {
     if (memorySpace != SharedMemorySpaceAttr::get(ctx)) {
       return emitError()
 
@@ -566,7 +566,7 @@ class BlockedToMMAv5 : public mlir::OpRewritePattern<DotOp> {
     unsigned colStride = 32 / bitwidth;
     Attribute accEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
         context, instrShape[0], instrShape[1], colStride, CTASplitNum[0],
-        CTASplitNum[1]);
+        CTASplitNum[1], useTwoCTAs);
     Attribute tensorMemorySpace =
         triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
     MemDescType accMemDescType =
@@ -847,7 +847,7 @@ class ScaledBlockedToMMAv5
     auto bitwidth = oldRetType.getElementType().getIntOrFloatBitWidth();
     unsigned colStride = 32 / bitwidth;
     Attribute accEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
-        context, m, n, colStride, CTASplitNum[0], CTASplitNum[1]);
+        context, m, n, colStride, CTASplitNum[0], CTASplitNum[1], false);
     Attribute tensorMemorySpace =
         triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
     MemDescType accMemDescType =
Original file line number	Diff line number	Diff line change
`@@ -107,8 +107,8 @@ inline const char *getOpShape(TMemAccessAtom atom) {`
`107`	`107`	`llvm_unreachable("Unknown TMemAccessAtom");`
`108`	`108`	`}`
`109`	`109`
`110`		`-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,`
`111`		`- bool unpacked);`
	`110`	`+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom, bool unpacked,`
	`111`	`+ bool withWarp);`
`112`	`112`
`113`	`113`	`TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);`
`114`	`114`