
Commit 1d8e147

[BACKEND] Throw an error instead of miscompiling very large tcgen05.mma along N (#8915)
See the comments in the PR for the full description of the issue. The skips in `test_core.py` now skip exactly the tests that would fail, and those tests now fail with a verifier error rather than miscompiling.
1 parent 2c59bed commit 1d8e147

4 files changed, +59 -13 lines changed

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 41 additions & 0 deletions
```diff
@@ -407,6 +407,10 @@ LogicalResult TCGen5MMAOp::verify() {
     return emitOpError("The col stride of the return operand must be 32 / ")
            << retType.getElementTypeBitWidth() << " but got "
            << retEnc.getColStride();
+  // The maximum size of an MMA instruction is 128x256
+  if (retEnc.getBlockN() > 256)
+    return emitOpError("The block size of the return operand must be less than "
+                       "or equal to 256");
 
   auto aSplit = getCTASplitNum(aEnc);
   auto bSplit = getCTASplitNum(bEnc);
@@ -422,6 +426,43 @@ LogicalResult TCGen5MMAOp::verify() {
   if (getTwoCtas()) {
     auto retSplit = getCTASplitNum(retEnc);
 
+    auto nPerCTA = retType.getDimSize(1) / retSplit[1];
+
+    // [Note: numRepN > 1 and two_ctas]
+    // Consider, just as an example, num_ctas=16 and a huge tile of shape
+    // MNK = 512x64x2048.
+    // This is an example of a layout with numRepN=2 and two_ctas=true:
+    // Layout RHS:
+    // #ttg.memdesc<64x2048xf16,
+    //   #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true,
+    //                      elementBitWidth = 16,
+    //                      CGALayout = [[0, 1], [0, 2], [0, 4], [0, 0]]}>>
+    //
+    // As a LinearLayout:
+    // offset = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 1], [8, 2],
+    //           [16, 4], [0, 8], [0, 16], [0, 32], [0, 64], [0, 128], [32, 0]]
+    // block = [[0, 256], [0, 512], [0, 1024], [0, 0]]
+    //
+    // The issue is that the data from CTA1 should sit next to that of the
+    // first part of the instruction. Since the max instruction size is
+    // 128x256, the layout we should use is
+    // offset = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 1], [8, 2],
+    //           [16, 4], [0, 8], [0, 16], [0, 32], [0, 64], [0, 256], [32, 0]]
+    // block = [[0, 128], [0, 512], [0, 1024], [0, 0]]
+    // (note how we swapped the bases [0, 256] and [0, 128]).
+    // The issue with that layout is that it breaks the invariant that the
+    // CGALayout splits the CGA tile into contiguous CTA tiles,
+    // i.e. total_layout = cta_layout * cga_layout.
+    // This invariant is used all over the place, to the point that for all
+    // legacy layouts we represent the CGALayout as the `cga_layout` we have
+    // to multiply on the right.
+    // With a bit of effort we could allow SharedLinearLayouts that do not
+    // divide on the right by a CGALayout, but for now we throw a lovely error.
+    if (nPerCTA > 256)
+      return emitOpError(
+          "We don't allow emitting more than one MMA instruction along N. "
+          "Reduce the block or increase the number of warps or CTAs along N");
+
     unsigned retM = retSplit[0];
     unsigned retN = retSplit[1];
     if (aSplit[0] != retM) {
```
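For intuition, here is a small, self-contained Python sketch of the contiguity argument in the note above. It is not Triton's LinearLayout API; the `span`/`contiguous` helpers and the reduction to N-components only are assumptions made for illustration. It XOR-combines the N components of the offset bases and checks whether the columns owned by one CTA form a single contiguous range.

```python
# Illustrative stand-in for LinearLayout, not Triton's API.
def span(offset_bases_n, block_off):
    # All N-columns one CTA touches: every XOR-combination of the offset
    # bases' N components, shifted by the CTA's block offset.
    cols = {0}
    for b in offset_bases_n:
        cols |= {c ^ b for c in cols}
    return sorted(c ^ block_off for c in cols)

def contiguous(cols):
    return cols == list(range(cols[0], cols[0] + len(cols)))

# N components of the bases from the note (dim0 components omitted).
good = [1, 2, 4, 8, 16, 32, 64, 128]  # block N-bases: 256, 512, 1024
bad  = [1, 2, 4, 8, 16, 32, 64, 256]  # block N-bases: 128, 512, 1024 (swapped)

print(contiguous(span(good, 256)))  # True:  the CTA owns columns [256, 512)
print(contiguous(span(bad, 128)))   # False: [128, 256) plus [384, 512)
```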

python/test/gluon/test_core.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -583,8 +583,9 @@ def min_shape(swizzling, dim0, dim1, trans):
     if M * N // 128 // num_ctas > MAX_ROWS:
         N //= (M * N // 128 // num_ctas // MAX_ROWS)
 
-    if two_ctas and warps != [8, 1] and (shape_m, shape_n, shape_k) == (2, 4, 1):
-        pytest.skip("FIXME: Fails with wrong answer. Need to investigate")
+    if two_ctas and N // ctas_per_cga[1] == 512:
+        # grep for [Note: numRepN > 1 and two_ctas]
+        pytest.skip("grep for [Note: numRepN > 1 and two_ctas]")
 
     assert M >= 64, "M must be at least 64 for mmav3 and mmav5"
 
@@ -671,7 +672,7 @@ def make_2cta_cga_layout(ctas_per_cga, cta_split, cta_order, two_cta_dim):
     shared_layout_b = ttgl.NVMMASharedLayout(swizzle_byte_width=swizzling_b, element_bitwidth=bitwidth, rank=2,
                                              transposed=transpose_b, cga_layout=cga_layout_b)
     if use_tcgen05:
-        tmem_shape = (min(M // ctas_per_cga[0], 128), N // ctas_per_cga[1])
+        tmem_shape = (min(M // ctas_per_cga[0], 128), min(N // ctas_per_cga[1], 256))
         acc_layout = TensorMemoryLayout(tmem_shape, col_stride=32 // torch.finfo(acc_dtype).bits,
                                         cta_split_num=tuple(ctas_per_cga), two_ctas=two_ctas)
     else:
```
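A rough sketch of the arithmetic behind the new skip condition (the concrete values are assumed for illustration and mirror the verifier check, not the test's actual parametrization):

```python
# Assumed example values, in the spirit of the note's 512x64x2048 scenario.
N, ctas_per_cga = 2048, (1, 4)
n_per_cta = N // ctas_per_cga[1]   # 512: exactly what the skip condition tests
num_rep_n = -(-n_per_cta // 256)   # ceil(512 / 256) == 2 MMA tiles along N
assert num_rep_n > 1               # with two_ctas, the verifier now rejects this
```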

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAHelpers.h

Lines changed: 5 additions & 2 deletions
```diff
@@ -157,8 +157,11 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     assert(to_vector(ll.getOutDimNames()) ==
            llvm::to_vector(
                ArrayRef<StringAttr>{str_attr("offset"), str_attr("block")}));
-    int32_t totalOffElems = ll.apply({{dims[0], a}, {dims[1], b}})[0].second;
-    int32_t smemByteOffsetb8 = totalOffElems * desc.bitwidth / 8;
+    auto offsetBlock = ll.apply({{dims[0], a}, {dims[1], b}});
+    int32_t offsetElems = offsetBlock[0].second;
+    int32_t block = offsetBlock[1].second;
+    assert(block == 0);
+    int32_t smemByteOffsetb8 = offsetElems * desc.bitwidth / 8;
     auto currDesc = desc.descriptor;
     // Take the next 0/1/2/3 bits after the 128b tile
     uint32_t mask = (desc.swizzlingByteWidth >> 4) - 1;
```
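The change unpacks both outputs of `ll.apply` and asserts the `block` component is zero, since this loader computes an offset into the local CTA's shared memory only. A hypothetical Python analogue of the pattern (the names mirror the C++; the toy layout is an assumption):

```python
def smem_byte_offset(ll_apply, a, b, bitwidth):
    # ll_apply returns [("offset", elems), ("block", blk)], like the C++ code.
    (_, offset_elems), (_, block) = ll_apply(a, b)
    assert block == 0, "this loader only addresses the local CTA's smem"
    return offset_elems * bitwidth // 8

# Toy row-major 16-column layout that never crosses CTAs.
toy = lambda a, b: [("offset", a * 16 + b), ("block", 0)]
print(smem_byte_offset(toy, 3, 5, 16))  # (3*16 + 5) elems * 2 bytes = 106
```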

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv5.cpp

Lines changed: 9 additions & 8 deletions
```diff
@@ -415,10 +415,15 @@ void convertDotImpl(const LLVMTypeConverter &typeConverter,
   auto tensorMemAttr =
       cast<ttng::TensorMemoryEncodingAttr>(dTensorTy.getEncoding());
   unsigned mmaSizeM = tensorMemAttr.getBlockM();
-  unsigned mmaSizeN = std::min(tensorMemAttr.getBlockN(), 256u);
+  unsigned mmaSizeN = tensorMemAttr.getBlockN();
+  // Checked in the verifier
+  assert(mmaSizeN <= 256 &&
+         "The maximum size of an MMA instruction is 128x256");
   unsigned mmaSizeK = op.mmaSizeK;
   int numRepM = ceil<unsigned>(M, mmaSizeM);
   int numRepN = ceil<unsigned>(N, mmaSizeN);
+  assert((!twoCTAs || numRepN == 1) &&
+         "grep for [Note: numRepN > 1 and two_ctas]");
   int numRepK = ceil<unsigned>(K, mmaSizeK);
 
   SmallVector<int64_t> shapeA = op.shapeA;
@@ -439,13 +444,9 @@ void convertDotImpl(const LLVMTypeConverter &typeConverter,
 
   auto isFp4b = op.numBitsPerElementB == 4;
   // [Instr shape twoCTAs]
-  // This division by 2 in 2CTA mode a bit subtle:
-  // The issue here is that in 2CTA you multiply in one instruction a tensor
-  // of shape MNK = 256, K, N, and you put it into TMEM of shape 128, K, N*2.
-  // So to compute the shapePerCTA, on the lhs we can look at the TMEM shape,
-  // but to compute the shapePerCTA on the rhs, we need to divide by 2.
-  // Something similar happens when we multiply by 2 the mmaSizeM when creating
-  // It's a massive code smell tho
+  // To compute an output tile of [mmaSizeM, mmaSizeN] in 2CTA mode, we load
+  // an A of size 2 * mmaSizeM x K and a B of size K x mmaSizeN.
+  // Now, since B is split among 2 CTAs along N, we need to divide by 2.
   DotOpMmaSmemLoader bLoader = DotOpMmaSmemLoader::build(
       loc, rewriter, bTensorTy, baseB, {mmaSizeK, mmaSizeN / (twoCTAs ? 2 : 1)},
       1, 5, isFp4b);
```
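A sketch of the 2-CTA shape bookkeeping the rewritten comment describes (illustrative names, not the actual builder API): each CTA of the pair holds half of B along N, so the shared-memory tile handed to the B loader is halved along N when `twoCTAs` is set.

```python
def b_loader_tile(mma_size_k, mma_size_n, two_ctas):
    # Per-CTA smem tile for B: halved along N in 2CTA mode, since the
    # K x mmaSizeN operand is split between the two CTAs of the pair.
    return (mma_size_k, mma_size_n // (2 if two_ctas else 1))

print(b_loader_tile(64, 256, True))   # (64, 128): each CTA holds half of B
print(b_loader_tile(64, 256, False))  # (64, 256): one CTA holds all of B
```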
