Skip to content

Commit 505cd44

Browse files
[Gluon] Expose 3d Dot FMA (#9501)
Enables batched (3D) FMA dots in Gluon.

Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>
1 parent e2f77ae commit 505cd44

File tree

3 files changed

+41
-7
lines changed

3 files changed

+41
-7
lines changed

lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMADotUtility.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ LogicalResult parametricConvertFMADot(DotOp op, DotOp::Adaptor adaptor,
103103
Value llA = adaptor.getA();
104104
Value llB = adaptor.getB();
105105

106-
auto sizePerThread = getContigPerThread(dTensorTy);
106+
llvm::SmallVector<unsigned> sizePerThread{dLayout.getSizePerThread()};
107107
auto numElemsPerThread = product(sizePerThread);
108108
SmallVector<unsigned> shapePerCTATile;
109109
for (auto [reg, thread, warp] :

python/test/gluon/test_core.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1915,13 +1915,44 @@ def kernel(a_ptr, b_ptr, c_ptr, out_ptr):
19151915
ttgl.store(out_ptr + offs, out)
19161916

19171917
a = torch.rand((B, B), dtype=torch.float32, device="cuda")
1918-
b = torch.ones((B, B), dtype=torch.float32, device="cuda")
1918+
b = torch.rand((B, B), dtype=torch.float32, device="cuda")
19191919
c = torch.rand((B, B), dtype=torch.float32, device="cuda")
19201920
out = torch.empty((B, B), dtype=torch.float32, device="cuda")
19211921
kernel[(1, )](a, b, c, out)
19221922
torch.testing.assert_close(out, torch.addmm(c, a, b), atol=1e-2, rtol=1e-2)
19231923

19241924

1925+
def test_dot3d_fma():
    """Test a batched (3D) FMA dot in Gluon: out = a @ b + c per batch slice.

    Launches a single program that loads full [BATCH, B, B] operands, runs
    ``ttgl.dot_fma`` on rank-3 tensors, and compares against
    ``torch.matmul(a, b) + c`` on the host.
    """
    torch.manual_seed(42)
    # Problem sizes are constexpr so the kernel specializes on them.
    B = ttgl.constexpr(32)
    BATCH = ttgl.constexpr(8)
    threads_per_warp = ttgl.constexpr(THREADS_PER_WARP)

    @gluon.jit
    def kernel(a_ptr, b_ptr, c_ptr, out_ptr):
        # Rank-3 blocked layout: 1 element per thread, threads spread along
        # dim 1, warps along dim 0 (the batch dim), order [2, 1, 0].
        layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1, 1], [1, threads_per_warp, 1], [ttgl.num_warps(), 1, 1],
                                                    [2, 1, 0])
        # Operand layouts for the FMA dot share the accumulator's parent
        # layout.  NOTE(review): k_width=0 appears to be the convention for
        # FMA (non-MMA) dot operands — confirm against the layout docs.
        lhs_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=layout, operand_index=0, k_width=0)
        rhs_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=layout, operand_index=1, k_width=0)

        # Per-axis index ranges; each uses `layout` sliced along the two
        # dims it does not span, then broadcasts back to rank 3.
        offs_b = ttgl.arange(0, BATCH, layout=ttgl.SliceLayout(1, ttgl.SliceLayout(2, layout)))[:, None, None]
        offs_m = ttgl.arange(0, B, layout=ttgl.SliceLayout(0, ttgl.SliceLayout(2, layout)))[None, :, None]
        offs_n = ttgl.arange(0, B, layout=ttgl.SliceLayout(0, ttgl.SliceLayout(1, layout)))[None, None, :]
        # Linearized offsets into the contiguous [BATCH, B, B] buffers.
        offs = offs_b * B * B + offs_m * B + offs_n
        a = ttgl.convert_layout(ttgl.load(a_ptr + offs), lhs_layout)
        b = ttgl.convert_layout(ttgl.load(b_ptr + offs), rhs_layout)
        c = ttgl.load(c_ptr + offs)
        # Batched dot via scalar FMAs: out = a @ b + c for each batch slice.
        out = ttgl.dot_fma(a, b, c)
        ttgl.store(out_ptr + offs, out)

    a = torch.rand((BATCH, B, B), dtype=torch.float32, device="cuda")
    b = torch.rand((BATCH, B, B), dtype=torch.float32, device="cuda")
    c = torch.rand((BATCH, B, B), dtype=torch.float32, device="cuda")
    out = torch.empty((BATCH, B, B), dtype=torch.float32, device="cuda")
    # One program instance computes the entire batch.
    kernel[(1, )](a, b, c, out)
    # Loose fp32 tolerances: FMA accumulation order differs from torch's.
    torch.testing.assert_close(out, torch.matmul(a, b) + c, atol=1e-2, rtol=1e-2)
1954+
1955+
19251956
@gluon.jit
19261957
def kernel_auto_layout_constant(threads_per_warp: ttgl.constexpr):
19271958
BLOCK: ttgl.constexpr = 16

python/triton/experimental/gluon/language/_core.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -635,11 +635,14 @@ def dot_fma(a, b, acc, _semantic=None):
635635
assert b.type.layout.parent == mma_layout, "b's parent layout must be the same as acc's layout"
636636
assert a.type.layout.operand_index == 0, "a's operand index must be 0"
637637
assert b.type.layout.operand_index == 1, "b's operand index must be 1"
638-
639-
M, N = acc.shape
640-
K = a.shape[1]
641-
if M * N * K > 2**19:
642-
warnings.warn(f"Large dot FMA instruction size {M}x{N}x{K} may have slow compile times")
638+
assert len(acc.shape) == 2 or len(acc.shape) == 3
639+
assert len(acc.shape) == len(a.shape) == len(b.shape)
640+
641+
unified_dot_shape = acc.shape + a.shape[-1:] # join batch/M/N and K in one list
642+
if math.prod(unified_dot_shape) > 2**19:
643+
dot_name = "batched dot" if len(acc.shape) == 3 else "dot"
644+
shape_str = "x".join([str(x) for x in unified_dot_shape])
645+
warnings.warn(f"Large {dot_name} FMA instruction size {shape_str} may have slow compile times")
643646

644647
handle = _semantic.dot(a, b, acc, input_precision=None, max_num_imprecise_acc=None, out_dtype=acc.dtype).handle
645648
return tensor(handle, acc.type)

0 commit comments

Comments
 (0)