Skip to content

Commit a3ace81

Browse files
pchen7e2 authored and meta-codesync[bot] committed
[4/N][TLX-2cta] Special logic for DotOp verification when tlx is in 2cta mode (#642)
Summary: By default, this is the verifier for output dims generated by `DotOpInterfaceTrait` https://github.com/facebookexperimental/triton/blob/70aa21cb8602116e1feedbf8348609b4f4b568b9/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td#L62-L79 In our implementation, we chose to maintain shapes for A and D but shrink tensor B by half so we need to override the verifier logic if 2cta flag is ON for the mma op. ``` % make test-lit (all passing) % third_party/tlx/run_all.sh Need to build triton in this script? {y|n}n Run all LITs? {y|n}n Run core Triton python unit tests? {y|n}n Run all TLX unit tests? {y|n}y Running TLX Unit Tests ... (all passing or skipped) Run TLX tutorial kernels (correctness|performance|no)? {c|p|n} c Verifying correctness of TLX tutorial kernels (all passing) ``` Pull Request resolved: #642 Reviewed By: htyu Differential Revision: D86337216 Pulled By: pchen7e2 fbshipit-source-id: be2286cc1258d20efa22a0eb2eb92fb6e38b7fc8
1 parent 3d18916 commit a3ace81

File tree

6 files changed

+35
-10
lines changed

6 files changed

+35
-10
lines changed

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
7474
auto bShape = bTy.getShape();
7575
auto cShape = cTy.getShape();
7676
return cShape[cShape.size() - 2] == aShape[aShape.size() - 2] &&
77-
cShape[cShape.size() - 1] == bShape[aShape.size() - 1];
77+
cShape[cShape.size() - 1] == bShape[bShape.size() - 1];
7878
}]>
7979
];
8080

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {
489489

490490
def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
491491
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
492-
DeclareOpInterfaceMethods<DotOpInterface>,
492+
DeclareOpInterfaceMethods<DotOpInterface, ["verifyOutputDims"]>,
493493
DeclareOpInterfaceMethods<MMAv5OpInterface>,
494494
AttrSizedOperandSegments
495495
]> {

lib/Dialect/Triton/IR/OpInterfaces.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,9 @@ LogicalResult verifyDotOpInterface(Operation *op) {
6767
if (!dotOp.verifyOutputDims())
6868
return dotOp->emitOpError(
6969
"expected the output shape to be the concatenation of the last "
70-
"dimension of the first operand and the last dimension of the "
71-
"second ");
70+
"dimension of the first operand and (the last dimension of the "
71+
"second if 1cta; or 2 times the last dimension of the second operand "
72+
"if 2cta TLX)");
7273
return success();
7374
}
7475

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,29 @@ bool TCGen5MMAOp::verifyDims() {
392392
return aShape[aShape.size() - 1] == bShape[aShape.size() - 2];
393393
}
394394

395+
bool TCGen5MMAOp::verifyOutputDims() {
396+
397+
if (getTwoCtas()) {
398+
// Here we have to relax the verification to support two possibilities
399+
// - For TLX 2CTA:
400+
// - Full MMA shape: [2M, K] x [K, N] -> [2M, N]
401+
// - Each CTA: [M, K] x [K, N/2] -> [M, N]. We're verifying each CTA here.
402+
// - For non TLX 2CTA: each CTA has [M, K] x [K, N] -> [M, N]
403+
// We cannot rely on module attr to differentiate them here because this
404+
// verification can run before Fixup pass. If we want to be as accurate as
405+
// possible, we should have a tlxTwoCTAs flag on MMA Op in the future
406+
auto aShape = getA().getType().getShape();
407+
auto bShape = getB().getType().getShape();
408+
auto dShape = getD().getType().getShape();
409+
return dShape[dShape.size() - 2] == aShape[aShape.size() - 2] &&
410+
(dShape[dShape.size() - 1] == bShape[bShape.size() - 1] /* non TLX*/
411+
|| dShape[dShape.size() - 1] ==
412+
2 * bShape[bShape.size() - 1] /* TLX 2CTA*/);
413+
}
414+
// 1cta case still delegates to default verifiers
415+
return DotOpInterfaceTrait::verifyOutputDims();
416+
}
417+
395418
Value TCGen5MMAOp::useAccumulator() { return getUseD(); }
396419

397420
void TCGen5MMAOp::setUseAccumulator(Value flag) {

test/TLX/attach-metadata.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,15 +180,15 @@ module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.
180180
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
181181
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
182182
tt.func @tc_gen5_mma(%a: !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
183-
%b: !ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
183+
%b: !ttg.memdesc<128x64xf16, #shared1, #ttg.shared_memory>,
184184
%c: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
185185
%useAcc: i1,
186186
%pred: i1,
187187
%barrier: !ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>,
188188
%barrierPred: i1) {
189189
ttng.tc_gen5_mma %a, %b, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async, two_ctas}:
190190
!ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
191-
!ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
191+
!ttg.memdesc<128x64xf16, #shared1, #ttg.shared_memory>,
192192
!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
193193
!ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>
194194
tt.return

test/TLX/tlx-verifier.mlir

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,20 @@ module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.
3434
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
3535
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
3636
tt.func @tc_gen5_mma(%a: !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
37-
%b: !ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
37+
%b1: !ttg.memdesc<128x64xf16, #shared1, #ttg.shared_memory>,
38+
%b2: !ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
3839
%c: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
3940
%useAcc: i1,
4041
%pred: i1,
4142
%barrier: !ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>,
4243
%barrierPred: i1) {
43-
ttng.tc_gen5_mma %a, %b, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async, two_ctas}:
44+
ttng.tc_gen5_mma %a, %b1, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async, two_ctas}:
4445
!ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
45-
!ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
46+
!ttg.memdesc<128x64xf16, #shared1, #ttg.shared_memory>,
4647
!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
4748
!ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>
4849
// expected-error @+1 {{Expecting all dot ops to be 2cta together}}
49-
ttng.tc_gen5_mma %a, %b, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async}:
50+
ttng.tc_gen5_mma %a, %b2, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async}:
5051
!ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
5152
!ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>,
5253
!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,

0 commit comments

Comments (0)