Commit f8a19d1
[TritonNVIDIAGPU] Revert MMAv5 write effect on barrier (triton-lang#6484)
This is a tiny partial revert of triton-lang#6476, removing the added `MemWrite<SharedMemory>` effect on the barrier operand of the MMAv5 ops. The effect wasn't present before, and it caused extra barriers to be inserted between MMA ops and mbarrier waits, producing large performance regressions across the board. The trade-off is that, as things stand, the compiler can't see interfering memory effects between the MMAv5 ops and barrier waits, so we will probably have to model the side effects here in a different way. Technically, the MMAv5 ops don't write or read the barrier; they push a request onto the MMAv5 pipeline.
Parent: face3d2

File tree: 2 files changed (+35, −4 lines)


lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
Lines changed: 0 additions & 4 deletions

@@ -208,10 +208,6 @@ static void getMMAEffects(
     effects.emplace_back(MemoryEffects::Read::get(), &op.getAMutable(),
                          TensorMemory::get());
   }
-  if (op.getBarrier()) {
-    effects.emplace_back(MemoryEffects::Write::get(),
-                         op.getBarrierMutable().begin(), SharedMemory::get());
-  }
 
   effects.emplace_back(MemoryEffects::Read::get(), &op.getBMutable(),
                        SharedMemory::get());
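
For context on why that one effect mattered: membar-style analyses decide where to insert synchronization by querying each op's declared memory effects, so a `MemWrite<SharedMemory>` on the barrier operand made the MMA op look like a shared-memory writer that conflicts with the following wait on the same buffer. Below is a simplified sketch of that kind of query; it is not Triton's actual Membar implementation, and the helper name, the local stand-in resource, and the conservative fallback are assumptions for illustration only.

// Simplified sketch of an effect-based "does this op write shared memory?"
// check, in the style a membar analysis might use. Not Triton's Membar code.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

// Stand-in for Triton's SharedMemory side-effect resource (the one referenced
// in the diff above); declared locally here to keep the sketch self-contained.
struct SharedMemory : SideEffects::Resource::Base<SharedMemory> {
  StringRef getName() final { return "<SharedMemory>"; }
};

// With the now-reverted MemWrite<SharedMemory> on the barrier operand,
// ttng.tc_gen5_mma would answer "true" here, so a barrier was conservatively
// placed between the MMA and the matching ttng.wait_barrier.
static bool writesSharedMemory(Operation *op) {
  auto iface = dyn_cast<MemoryEffectOpInterface>(op);
  if (!iface)
    return true; // no effect info: assume the worst (assumption in this sketch)
  SmallVector<MemoryEffects::EffectInstance> effects;
  iface.getEffects(effects);
  for (const MemoryEffects::EffectInstance &effect : effects)
    if (isa<MemoryEffects::Write>(effect.getEffect()) &&
        effect.getResource() == SharedMemory::get())
      return true;
  return false;
}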

test/TritonNvidiaGPU/membar.mlir
Lines changed: 35 additions & 0 deletions

@@ -120,3 +120,38 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   tt.return
 }
 }
+
+// -----
+
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 16}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = true, elementBitWidth = 16}>
+#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
+
+// CHECK-LABEL: @wait_after_mma
+tt.func @wait_after_mma(
+    %a: !ttg.memdesc<128x128xf16, #shared, #smem>,
+    %b: !ttg.memdesc<128x128xf16, #shared1, #smem>,
+    %c: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
+    %useAcc: i1,
+    %pred: i1
+) {
+  %phase = arith.constant 0 : i32
+  %barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared2, #smem, mutable>
+  // CHECK: ttng.tc_gen5_mma
+  ttng.tc_gen5_mma %a, %b, %c, %useAcc, %pred, %barrier :
+      (!ttg.memdesc<128x128xf16, #shared, #smem>,
+       !ttg.memdesc<128x128xf16, #shared1, #smem>,
+       !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>,
+       i1, i1,
+       !ttg.memdesc<1xi64, #shared2, #smem, mutable>) -> ()
+  // CHECK-NEXT: ttng.wait_barrier
+  ttng.wait_barrier %barrier, %phase : !ttg.memdesc<1xi64, #shared2, #smem, mutable>
+  tt.return
+}
+
+}
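
The closing point of the commit message, that MMAv5 ops don't really read or write the barrier but push a request onto the MMAv5 pipeline, hints at modeling the interaction through something other than a SharedMemory write. The sketch below is one purely hypothetical way such a model could look, using a dedicated MLIR side-effect resource; the name MMAv5Pipeline and the wiring shown are assumptions for illustration, not part of this commit or of Triton.

// Purely hypothetical sketch: record the MMA's barrier "commit" against a
// dedicated side-effect resource so it no longer aliases ordinary
// shared-memory traffic. MMAv5Pipeline is an assumption, not Triton code.
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

struct MMAv5Pipeline : public SideEffects::Resource::Base<MMAv5Pipeline> {
  StringRef getName() final { return "<MMAv5Pipeline>"; }
};

// Inside a getMMAEffects-style helper, the reverted block could then read:
//
//   if (op.getBarrier()) {
//     effects.emplace_back(MemoryEffects::Write::get(),
//                          op.getBarrierMutable().begin(),
//                          MMAv5Pipeline::get());
//   }
//
// An analysis that only tracks the SharedMemory resource would then stop
// inserting barriers before the matching ttng.wait_barrier, while passes
// that care about MMA completion could still query this resource explicitly.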
