[NVIDIA] Use correct commit type for TMA (#5738)

peterbell10 · web-flow · commit 0c7edf91f0c1 · 2025-01-29T13:06:33.000Z
Follow-up to #5733 which somehow the passed CI and auto-merged, even
with this bug. TMA should be using `bulk.commit.group` rather than
`commit.group`.
diff --git a/test/Conversion/tma_to_llvm.mlir b/test/Conversion/tma_to_llvm.mlir
@@ -168,7 +168,7 @@ tt.func @tma_scatter(%arg0: !tt.ptr<i8>, %arg1: tensor<32xi32, #ttg.slice<{dim =
   // CHECK-SAME: (i1 [[PRED]], ptr addrspace(1) %0, i32 %2, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, ptr addrspace(3) [[PTR]])
   ttng.async_tma_scatter %arg0[%arg1, %arg2] %arg3 : !tt.ptr<i8>, tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<32x128xbf16, #shared1, #smem, mutable>
 
-  // CHECK: call void @llvm.nvvm.cp.async.commit.group()
+  // CHECK: nvvm.cp.async.bulk.commit.group()
 
   // CHECK-NEXT: ret void
   tt.return
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1605,7 +1605,7 @@ LogicalResult AsyncTMAScatterOpConversion::matchAndRewrite(
 
   // TODO: Separate the syncronizations operations into separate TTGIR ops to
   // be able to schedule them at the high level.
-  rewriter.create<NVVM::CpAsyncCommitGroupOp>(loc);
+  rewriter.create<NVVM::CpAsyncBulkCommitGroupOp>(loc);
 
   rewriter.eraseOp(op);
   return success();