import torch
import pytest

-from triton._internal_testing import is_cuda, is_ampere_or_newer, is_hopper_or_newer, is_hopper
+from triton._internal_testing import is_cuda, is_ampere_or_newer, is_hip_cdna3, is_hip_cdna4, is_hopper_or_newer, is_hopper
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier
@@ -143,3 +143,66 @@ def test_warpgroup_mma(ASYNC):
    ref = torch.matmul(a, b)

    torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)
+
+
+@pytest.mark.parametrize("M, N, K", [(32, 32, 16), (16, 16, 32)])
+@pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
+@pytest.mark.parametrize("num_warps", [4, 8])
+@pytest.mark.parametrize("cdna_version", [3, 4])
+def test_amd_mfma(M, N, K, in_dtype, num_warps, cdna_version):
+
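+    # A single-tile GEMM written in Gluon: load A and B with buffer loads, issue one MFMA, and store the result.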
+    @gluon.jit
+    def kernel(a_ptr, b_ptr, c_ptr, stride_am, stride_ak,  #
+               stride_bk, stride_bn,  #
+               stride_cm, stride_cn, BLOCK_SIZE_M: ttgl.constexpr, BLOCK_SIZE_N: ttgl.constexpr,
+               BLOCK_SIZE_K: ttgl.constexpr, blocked: ttgl.constexpr, mfma_layout: ttgl.constexpr):
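+        # A and B feed the MFMA as dot operands whose parent is the MFMA layout, with k_width=8.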
+        dot_a_layout: ttgl.constexpr = ttgl.DotOperandLayout(operand_index=0, parent=mfma_layout, k_width=8)
+        dot_b_layout: ttgl.constexpr = ttgl.DotOperandLayout(operand_index=1, parent=mfma_layout, k_width=8)
+
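+        # Build the 2D tile offsets for A and B from arange over the blocked layout and the tensor strides.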
+        offs_am = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, blocked))
+        offs_bn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, blocked))
+
+        offs_ak = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(0, blocked))
+        offs_bk = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(1, blocked))
+        offs_a = offs_am[:, None] * stride_am + offs_ak[None, :] * stride_ak
+        offs_b = offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+
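+        # Load both tiles, convert them to the dot operand layouts, and accumulate the MFMA result in float32.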
+        a = ttgl.amd.cdna3.buffer_load(ptr=a_ptr, offsets=offs_a)
+        b = ttgl.amd.cdna3.buffer_load(ptr=b_ptr, offsets=offs_b)
+        a1 = ttgl.convert_layout(a, layout=dot_a_layout)
+        b1 = ttgl.convert_layout(b, layout=dot_b_layout)
+        acc = ttgl.zeros([BLOCK_SIZE_M, BLOCK_SIZE_N], ttgl.float32, mfma_layout)
+        c = ttgl.amd.cdna3.mfma(a1, b1, acc)
+        c = ttgl.convert_layout(c, layout=blocked)
+        c = c.to(a_ptr.dtype.element_ty)
+
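+        # Compute the output offsets and write C back with a buffer store.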
+        offs_cm = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, blocked))
+        offs_cn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, blocked))
+        offs_c = offs_cm[:, None] * stride_cm + offs_cn[None, :] * stride_cn
+        ttgl.amd.cdna3.buffer_store(stored_value=c, ptr=c_ptr, offsets=offs_c)
+
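+    # Only run on CDNA3/CDNA4 hardware, and only for the MFMA version that matches the target.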
+    if not is_hip_cdna4() and not is_hip_cdna3():
+        pytest.skip("mfma requires the target to be CDNA3 or CDNA4")
+
+    if is_hip_cdna3() and cdna_version != 3:
+        pytest.skip("On a CDNA3 target, skip if the MFMA version is not 3")
+
+    if is_hip_cdna4() and cdna_version != 4:
+        pytest.skip("On a CDNA4 target, skip if the MFMA version is not 4")
+
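+    # Host-side setup: random fp16/bf16 inputs plus the blocked and MFMA layouts passed to the kernel as constexprs.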
+    elem_type = torch.float16 if in_dtype == 'float16' else torch.bfloat16
+    a = torch.randn((M, K), device='cuda', dtype=elem_type) - 0.5
+    b = torch.randn((K, N), device='cuda', dtype=elem_type) - 0.5
+    c = torch.empty((M, N), device=a.device, dtype=elem_type)
+    nonkdim: ttgl.constexpr = 32
+    blocked: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[4, 4], threads_per_warp=[4, 16],
+                                                 warps_per_cta=[num_warps, 1], order=[1, 0])
+    mfma_layout: ttgl.constexpr = ttgl.amd.AMDMFMALayout(version=cdna_version, instr_shape=[nonkdim, nonkdim],
+                                                         transposed=True, warps_per_cta=[num_warps, 1])
+
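+    # Launch a single program and check the result against a torch.matmul reference.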
+    kernel[1, 1](a, b, c, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), BLOCK_SIZE_M=M,
+                 BLOCK_SIZE_N=N, BLOCK_SIZE_K=K, blocked=blocked, mfma_layout=mfma_layout, num_warps=num_warps)
+
+    ref = torch.matmul(a, b)
+    triton_output = c
+    torch.testing.assert_close(ref, triton_output)