@@ -119,8 +119,8 @@ def shared_memory_kernel(XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr, layout_
                          layout_b: ttgl.constexpr, smem_layout: ttgl.constexpr):
     unused = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, YBLOCK], smem_layout)
     a = ttgl.full([XBLOCK, YBLOCK], 0, ttgl.int32, layout_a)
-    tl.static_assert(a.numel == unused.numel)
-    tl.static_assert(unused.numel == XBLOCK * YBLOCK)
+    ttgl.static_assert(a.numel == unused.numel)
+    ttgl.static_assert(unused.numel == XBLOCK * YBLOCK)
     mem = ttgl.allocate_shared_memory(ttgl.int32, a.shape, smem_layout, a)
     b = mem.load(layout_b)  # noqa: F841
     mem.store(a)
@@ -641,7 +641,7 @@ def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr):
     mbarrier.init(bar, count=1)

     tma.async_copy_global_to_shared(input_desc, [0, 0], bar, smem)
-    tl.static_assert(input_desc.block_type.nbytes == XBLOCK * XBLOCK * 2)
+    ttgl.static_assert(input_desc.block_type.nbytes == XBLOCK * XBLOCK * 2)
     mbarrier.expect(bar, input_desc.block_type.nbytes)
     mbarrier.wait(bar, 0)

@@ -941,7 +941,7 @@ def reduce_kernel(out):
     ttgl.static_assert(pairs[0].type.layout == ttgl.SliceLayout(0, layout))
     ttgl.static_assert(pairs[1].type.layout == ttgl.SliceLayout(0, layout))
     result = scalar + s1 + pairs[0] + pairs[1]
-    tl.store(out + ttgl.arange(0, 16, s0.type.layout), result)
+    ttgl.store(out + ttgl.arange(0, 16, s0.type.layout), result)


 @pytest.mark.parametrize("target", ALL_TARGETS)
@@ -1057,8 +1057,8 @@ def test_elementwise_core():

 @gluon.jit
 def linear_layout_kernel():
-    ll: tl.constexpr = ttgl.DistributedLinearLayout(reg_bases=[[1]], lane_bases=[[2], [4], [8], [16], [32]],
-                                                    warp_bases=[[64], [128]], block_bases=[], shape=[256])
+    ll: ttgl.constexpr = ttgl.DistributedLinearLayout(reg_bases=[[1]], lane_bases=[[2], [4], [8], [16], [32]],
+                                                      warp_bases=[[64], [128]], block_bases=[], shape=[256])
     ttgl.arange(0, 256, layout=ll)


@@ -1077,6 +1077,20 @@ def test_linear_layout(target):
     """)


+@filecheck_test
+@gluon.jit
+def test_dot_operand_layout():
+    # CHECK: [[NVMMA:#.*]] = #ttg.nvidia_mma
+    # CHECK: test_dot_operand_layout
+    mma_layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1],
+                                                             instr_shape=[16, 32, 16])
+    layout: ttgl.constexpr = ttgl.DotOperandLayout(operand_index=0, parent=mma_layout, k_width=2)
+    # CHECK: arith.constant {{.*}} tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[NVMMA]], kWidth = 2}>>
+    x = ttgl.full([256, 128], 0.0, ttgl.float16, layout)
+    y = x.sum(axis=1)
+    ttgl.static_assert(y.type.layout.parent == layout)
+
+
 @filecheck_test
 @gluon.jit
 def test_tensor_permute():
@@ -1201,7 +1215,7 @@ def async_copy_kernel(inp, xnumel, XBLOCK: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(inp.dtype.element_ty, [XBLOCK], ttgl.SwizzledSharedLayout(1, 1, 1, order=[0]))
     block_layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
     xindex = ttgl.arange(0, XBLOCK, block_layout)
-    mask = tl.max_constancy(xindex < xnumel, 2)
+    mask = ttgl.max_constancy(xindex < xnumel, 2)

     async_copy.async_copy_global_to_shared(smem, inp + xindex)
     async_copy.async_copy_global_to_shared(smem, inp + xindex, mask, cache_modifier=".ca", eviction_policy="evict_last",