Commit 7b29378

Remove verification of tmem allocation size (#8782)
This check can fail because `supportMMA` does not validate that the result of a dot fits in tmem before selecting MMAv5, which then leads to an assertion failure. Instead of hardcoding this constraint in the optimizer, remove the check and let the allocator run out of tmem later.
1 parent 06c0b20 commit 7b29378

File tree

2 files changed: +21 −8 lines

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 0 additions & 8 deletions
@@ -143,14 +143,6 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
                  << ll.getOutDimSize(dims[0]) << "x"
                  << ll.getOutDimSize(dims[1]);
     }
-    // Note the following holds for both M=64 and M=128 with 2CTA
-    auto nCol = ll.getInDimSize(StringAttr::get(ctx, "col"));
-    if (nCol / (enc.getCTASplitM() * enc.getCTASplitN()) >
-        512 * 32 / bitwidth) {
-      return emitError() << "nCol / (CTASplitM * CTASplitN) must be less than "
-                            "or equal to 512 * 32 / bitwidth but got "
-                         << nCol / (enc.getCTASplitM() * enc.getCTASplitN());
-    }
   } else if (auto enc = dyn_cast<SharedEncodingTrait>(encoding)) {
     if (memorySpace != SharedMemorySpaceAttr::get(ctx)) {
       return emitError()
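
For reference, the constraint enforced by the deleted lines above reduces to a single integer comparison. The following is a minimal standalone sketch of that arithmetic; fitsInTmem and its plain integer parameters are illustrative only (they are not Triton APIs), and the 512 * 32 figure is taken directly from the removed code.

#include <cstdint>
#include <iostream>

// Sketch of the check the verifier used to perform: the number of tmem
// columns per CTA split must not exceed 512 columns of 32 bits each,
// scaled by the element bitwidth.
bool fitsInTmem(int64_t nCol, int64_t ctaSplitM, int64_t ctaSplitN,
                int64_t bitwidth) {
  return nCol / (ctaSplitM * ctaSplitN) <= 512 * 32 / bitwidth;
}

int main() {
  // With 32-bit elements and no CTA split the limit is 512 columns, so a
  // hypothetical allocation needing 1024 columns would have been rejected.
  std::cout << std::boolalpha << fitsInTmem(1024, 1, 1, 32) << "\n"; // false
}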

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 21 additions & 0 deletions
@@ -743,3 +743,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %d : tensor<128x128xf32, #blocked3>
   }
 }
+
+// -----
+
+// We previously asserted that a tmem allocation must fit in the available tmem.
+// This would cause an assertion failure if the result matrix was too large.
+// Check that we allow the large result in AccelerateMatmul, and leave it to
+// the allocator to fail later.
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @res_too_big_for_mmav5
+  tt.func public @res_too_big_for_mmav5(%a: tensor<1024x16xf32, #blocked2>, %b: tensor<16x128xf32, #blocked1>, %c: tensor<1024x128xf32, #blocked>) -> tensor<1024x128xf32, #blocked> {
+    %ad = ttg.convert_layout %a : tensor<1024x16xf32, #blocked2> -> tensor<1024x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %bd = ttg.convert_layout %b : tensor<16x128xf32, #blocked1> -> tensor<16x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+    // CHECK: ttng.tc_gen5_mma
+    %d = tt.dot %ad, %bd, %c, inputPrecision = tf32 : tensor<1024x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<16x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<1024x128xf32, #blocked>
+    tt.return %d : tensor<1024x128xf32, #blocked>
+  }
+}
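
As a rough sanity check on why this result is "too big": assuming the commonly cited Blackwell tensor-memory size of 128 rows by 512 columns of 32-bit cells per SM (a figure not stated in this diff), the test's accumulator alone needs twice that capacity.

#include <cstdint>
#include <iostream>

int main() {
  // Assumed tmem capacity: 128 rows x 512 columns x 4 bytes (~256 KiB).
  constexpr int64_t tmemBytes = 128 * 512 * 4;
  // The test's accumulator: a 1024x128 tensor of f32 (4 bytes per element).
  constexpr int64_t accBytes = 1024 * 128 * 4;
  std::cout << "tmem: " << tmemBytes << " bytes, accumulator: " << accBytes
            << " bytes\n"; // 262144 vs 524288: the result cannot fit.
}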
