From 5a30326effddfa17543224bfeed474c117ea99e4 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Wed, 3 Sep 2025 17:57:54 +0000
Subject: [PATCH 1/8] Update summary and description fields for Barrier Ops in
 NVVMOps.td

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 263 +++++++++++++++++++-
 1 file changed, 260 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 9d93b4efe7a5b..5338bb2336ed9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -494,6 +494,26 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">,
 /// mbarrier.init instruction with generic pointer type
 def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> {
+  let summary = "MBarrier Initialization Op";
+  let description = [{
+    The `nvvm.mbarrier.init` operation initializes an mbarrier object at the specified 
+    memory location. 
+
+    This operation initializes the mbarrier object with the following state:
+    - Current phase: 0
+    - Expected arrival count: `count`
+    - Pending arrival count: `count`  
+    - Transaction count (tx-count): 0
+
+    The operation takes the following operands:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+    - `count`: Integer specifying the number of threads that will participate in barrier
+      synchronization. Must be in the range [1, 2²⁰ - 1].
+    - `predicate`: Optional predicate for conditional execution.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
+  }];
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count});
   }];
@@ -509,6 +529,13 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
 /// mbarrier.init instruction with shared pointer type
 def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> {
+  let summary = "Shared MBarrier Initialization Op";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
+  }];
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count});
   }];
@@ -521,6 +548,21 @@ def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVM
 
 def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
   Arguments<(ins LLVM_AnyPointer:$addr)> {
+  let summary = "MBarrier Invalidation Operation";
+  let description = [{
+    The `nvvm.mbarrier.inval` operation invalidates an mbarrier object at the 
+    specified memory location. 
+
+    This operation marks the mbarrier object as invalid, making it safe to repurpose 
+    the memory location for other uses or to reinitialize it as a new mbarrier object.
+    It is undefined behavior if the mbarrier object is already invalid.
+    
+    The operation takes the following operand:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
+  }];
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr});
   }];
@@ -529,6 +571,13 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
 
 def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
   Arguments<(ins LLVM_PointerShared:$addr)> {
+  let summary = "Shared MBarrier Invalidation Operation";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
+  }];
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr});
   }];
@@ -538,6 +587,22 @@ def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
 def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr)> {
+  let summary = "MBarrier Arrive Operation";
+  let description = [{
+    The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the 
+    mbarrier object at the specified address. Uses the default `.release.cta` semantics.
+
+    This operation causes the executing thread to signal its arrival at the barrier.
+    The operation returns an opaque 64-bit value that captures the phase of the 
+    mbarrier object prior to the arrive-on operation. The contents of this state 
+    value are implementation-specific.
+
+    The operation takes the following operand:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr});
   }];
@@ -547,6 +612,13 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
 def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr)> {
+  let summary = "Shared MBarrier Arrive Operation";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr});
   }];
@@ -556,6 +628,25 @@ def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
 def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> {
+  let summary = "MBarrier Arrive No-Complete Operation";
+  let description = [{
+    The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation 
+    on the mbarrier object with the guarantee that it will not cause the barrier to 
+    complete its current phase. Uses the default `.release.cta` semantics.
+
+    This operation causes the executing thread to signal its arrival at the barrier 
+    with a specified count, but ensures that the barrier phase will not complete as 
+    a result of this operation. The operation returns an opaque 64-bit value that 
+    captures the phase of the mbarrier object prior to the arrive-on operation.
+
+    The operation takes the following operands:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+    - `count`: Integer specifying the count argument to the arrive-on operation. 
+      Must be in the valid range as specified in the mbarrier object contents.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count});
   }];
@@ -565,6 +656,13 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
 def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> {
+  let summary = "Shared MBarrier Arrive No-Complete Operation";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count});
   }];
@@ -573,6 +671,26 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.
 
 def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,  
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> {
+  let summary = "MBarrier Arrive with Expected Transaction Count";
+  let description = [{
+    The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation 
+    followed by an arrive-on operation on the mbarrier object. Uses the default 
+    `.release.cta` semantics.
+
+    This operation first performs an expect-tx operation with the specified transaction 
+    count, then performs an arrive-on operation with an implicit count of 1. The 
+    expect-tx operation updates the expected transaction count for the barrier.
+
+    The operation takes the following operands:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+    - `txcount`: An unsigned integer specifying the expected transaction count 
+      for the expect-tx operation. This represents the number of asynchronous transactions 
+      expected to complete before the barrier phase completes.
+    - `predicate`: Optional predicate for conditional execution.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.b64 _, [%0], %1;"); }
@@ -580,7 +698,14 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
 }
 
 def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,  
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {    
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
+  let summary = "Shared MBarrier Arrive with Expected Transaction Count";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];    
   let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); }
@@ -588,7 +713,36 @@ def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.ex
 }
 
 def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,  
-  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {  
+  Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {
+  let summary = "MBarrier Potentially-Blocking Try Wait with Phase Parity";
+  let description = [{
+    The `nvvm.mbarrier.try_wait.parity` operation performs a potentially-blocking 
+    test for the completion of a specific phase of an mbarrier object using phase 
+    parity. It uses the default `.acquire.cta` semantics.
+
+    This operation waits for the completion of the mbarrier phase indicated by the 
+    phase parity. While it uses the underlying PTX `mbarrier.try_wait.parity` 
+    instruction, this MLIR operation generates a loop that enforces the test to 
+    complete before continuing execution, ensuring the barrier phase is actually 
+    completed rather than potentially timing out.
+
+    The operation takes the following operands:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+    - `phase`: An integer specifying the phase parity (0 or 1). Even phases 
+      have parity 0, odd phases have parity 1.
+    - `ticks`: An unsigned integer specifying the suspend time hint in 
+      nanoseconds. This may be used instead of the system-dependent time limit.
+
+    **Implementation behavior**:
+    This operation generates a PTX loop that repeatedly calls the underlying 
+    `mbarrier.try_wait.parity` instruction until the barrier phase completes. 
+    Unlike the raw PTX instruction which may return without completion after a 
+    timeout, this MLIR operation guarantees completion by continuing to loop until 
+    the specified phase is reached.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
+  }];  
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -607,7 +761,14 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
 }
 
 def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,  
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {  
+  Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
+  let summary = "Shared MBarrier Potentially-Blocking Try Wait with Phase Parity";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
+  }];  
   let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
   let extraClassDefinition = [{
     std::string $cppClass::getPtx() {
@@ -628,6 +789,30 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
 def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> {
+  let summary = "MBarrier Non-Blocking Test Wait Operation";
+  let description = [{
+    The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the 
+    completion of a specific phase of an mbarrier object. It uses the default
+    `.acquire.cta` semantics.
+
+    This operation tests whether the mbarrier phase specified by the state operand 
+    has completed. It is a non-blocking instruction that immediately returns the 
+    completion status without suspending the executing thread.
+
+    The operation takes the following operands:
+    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+      addressing, but the address must still be in the shared memory space.
+    - `state`: An opaque value returned by a previous `mbarrier.arrive` 
+      operation on the same mbarrier object during the current or immediately 
+      preceding phase.
+
+    The operation returns a boolean value indicating whether the specified phase 
+    has completed:
+    - `true`: The immediately preceding phase has completed
+    - `false`: The phase is still incomplete (current phase)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state});
   }];
@@ -637,6 +822,13 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
 def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> {
+  let summary = "Shared MBarrier Non-Blocking Test Wait Operation";
+  let description = [{
+    This Op is the same as the generic memory variant except that the mbarrier object
+    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
+  }];
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state});
   }];
@@ -648,6 +840,15 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
 //===----------------------------------------------------------------------===//
 
 def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
+  let summary = "CTA Barrier Synchronization Op (Barrier ID 0)";
+  let description = [{
+    The `nvvm.barrier0` operation is a convenience operation that performs barrier 
+    synchronization and communication within a CTA (Cooperative Thread Array) using 
+    barrier ID 0. It is functionally equivalent to `nvvm.barrier` with `barrierId = 0`. 
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier)
+  }];
+
   let assemblyFormat = "attr-dict";
   string llvmBuilder = [{
       createIntrinsicCall(
@@ -657,6 +858,34 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
 }
 
 def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
+  let summary = "CTA Barrier Synchronization Op";
+  let description = [{
+    The `nvvm.barrier` operation performs barrier synchronization and communication 
+    within a CTA (Cooperative Thread Array). It causes executing threads to wait for 
+    all non-exited threads participating in the barrier to arrive.
+
+    The operation takes two optional operands:
+
+    - `barrierId`: Specifies a logical barrier resource with value 0 through 15. 
+      Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
+    - `numberOfThreads`: Specifies the number of threads participating in the barrier. 
+      When specified, the value must be a multiple of the warp size. If not specified, 
+      all threads in the CTA participate in the barrier.
+
+    The barrier operation guarantees that when the barrier completes, prior memory 
+    accesses requested by participating threads are performed relative to all threads 
+    participating in the barrier. It also ensures that no new memory access is 
+    requested by participating threads before the barrier completes.
+
+    When a barrier completes, the waiting threads are restarted without delay, and 
+    the barrier is reinitialized so that it can be immediately reused.
+
+    This operation generates the `.aligned` version of the PTX barrier instruction, 
+    indicating that all threads in the CTA execute the same barrier instruction.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier)
+  }];
+
   let arguments = (ins     
     Optional<I32>:$barrierId,
     Optional<I32>:$numberOfThreads);
@@ -1046,6 +1275,34 @@ def NVVM_VoteSyncOp
 def NVVM_SyncWarpOp :
   NVVM_Op<"bar.warp.sync">,
   Arguments<(ins LLVM_Type:$mask)> {
+  let summary = "Warp Barrier Synchronization Op";
+  let description = [{
+    The `nvvm.bar.warp.sync` operation performs barrier synchronization for threads 
+    within a warp. 
+
+    This operation causes the executing thread to wait until all threads corresponding 
+    to the `mask` operand have executed a `bar.warp.sync` with the same mask value 
+    before resuming execution.
+
+    The `mask` operand specifies a 32-bit integer mask indicating threads participating 
+    in the barrier, where each bit position corresponds to the thread's lane ID within 
+    the warp. Only threads with their corresponding bit set in the mask participate 
+    in the barrier synchronization.
+
+    **Important constraints**:
+    - The behavior is undefined if the executing thread is not included in the mask 
+      (i.e., the bit corresponding to the thread's lane ID is not set)
+    - For compute capability sm_6x or below, all threads in the mask must execute 
+      the same `bar.warp.sync` instruction in convergence
+
+    This operation also guarantees memory ordering among participating threads. 
+    Threads within the warp that wish to communicate via memory can store to memory, 
+    execute `bar.warp.sync`, and then safely read values stored by other threads 
+    in the warp.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync)
+  }];
+
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_bar_warp_sync, {$mask});
   }];

From 6f796184fc2947461afe8e48fe85e4d3f26b8d6b Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 16:59:41 +0000
Subject: [PATCH 2/8] Italicized barrier object text, clarified shared memory
 variant descriptions

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 70 ++++++++++-----------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 5338bb2336ed9..c824d05060ac8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -496,17 +496,17 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> {
   let summary = "MBarrier Initialization Op";
   let description = [{
-    The `nvvm.mbarrier.init` operation initializes an mbarrier object at the specified 
+    The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified 
     memory location. 
 
-    This operation initializes the mbarrier object with the following state:
+    This operation initializes the *mbarrier object* with the following state:
     - Current phase: 0
     - Expected arrival count: `count`
     - Pending arrival count: `count`  
     - Transaction count (tx-count): 0
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
     - `count`: Integer specifying the number of threads that will participate in barrier
       synchronization. Must be in the range [1, 2²⁰ - 1].
@@ -531,7 +531,7 @@ def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVM
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> {
   let summary = "Shared MBarrier Initialization Op";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
@@ -550,15 +550,15 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
   Arguments<(ins LLVM_AnyPointer:$addr)> {
   let summary = "MBarrier Invalidation Operation";
   let description = [{
-    The `nvvm.mbarrier.inval` operation invalidates an mbarrier object at the 
+    The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the 
     specified memory location. 
 
-    This operation marks the mbarrier object as invalid, making it safe to repurpose 
-    the memory location for other uses or to reinitialize it as a new mbarrier object.
-    It is undefined behavior if the mbarrier object is already invalid.
+    This operation marks the *mbarrier object* as invalid, making it safe to repurpose 
+    the memory location for other uses or to reinitialize it as a new *mbarrier object*.
+    It is undefined behavior if the *mbarrier object* is already invalid.
     
     The operation takes the following operand:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
@@ -573,7 +573,7 @@ def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
   Arguments<(ins LLVM_PointerShared:$addr)> {
   let summary = "Shared MBarrier Invalidation Operation";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
@@ -590,15 +590,15 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
   let summary = "MBarrier Arrive Operation";
   let description = [{
     The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the 
-    mbarrier object at the specified address. Uses the default `.release.cta` semantics.
+    *mbarrier object* at the specified address. Uses the default `.release.cta` semantics.
 
     This operation causes the executing thread to signal its arrival at the barrier.
     The operation returns an opaque 64-bit value that captures the phase of the 
-    mbarrier object prior to the arrive-on operation. The contents of this state 
+    *mbarrier object* prior to the arrive-on operation. The contents of this state 
     value are implementation-specific.
 
     The operation takes the following operand:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
@@ -614,7 +614,7 @@ def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
   Arguments<(ins LLVM_PointerShared:$addr)> {
   let summary = "Shared MBarrier Arrive Operation";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
@@ -631,19 +631,19 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
   let summary = "MBarrier Arrive No-Complete Operation";
   let description = [{
     The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation 
-    on the mbarrier object with the guarantee that it will not cause the barrier to 
+    on the *mbarrier object* with the guarantee that it will not cause the barrier to 
     complete its current phase. Uses the default `.release.cta` semantics.
 
     This operation causes the executing thread to signal its arrival at the barrier 
     with a specified count, but ensures that the barrier phase will not complete as 
     a result of this operation. The operation returns an opaque 64-bit value that 
-    captures the phase of the mbarrier object prior to the arrive-on operation.
+    captures the phase of the *mbarrier object* prior to the arrive-on operation.
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
     - `count`: Integer specifying the count argument to the arrive-on operation. 
-      Must be in the valid range as specified in the mbarrier object contents.
+      Must be in the valid range as specified in the *mbarrier object* contents.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
   }];
@@ -658,7 +658,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> {
   let summary = "Shared MBarrier Arrive No-Complete Operation";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
@@ -674,7 +674,7 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
   let summary = "MBarrier Arrive with Expected Transaction Count";
   let description = [{
     The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation 
-    followed by an arrive-on operation on the mbarrier object. Uses the default 
+    followed by an arrive-on operation on the *mbarrier object*. Uses the default 
     `.release.cta` semantics.
 
     This operation first performs an expect-tx operation with the specified transaction 
@@ -682,7 +682,7 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
     expect-tx operation updates the expected transaction count for the barrier.
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
     - `txcount`: An unsigned integer specifying the expected transaction count 
       for the expect-tx operation. This represents the number of asynchronous transactions 
@@ -701,7 +701,7 @@ def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.ex
   Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
   let summary = "Shared MBarrier Arrive with Expected Transaction Count";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.arrive.expect_tx` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
@@ -717,7 +717,7 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
   let summary = "MBarrier Potentially-Blocking Try Wait with Phase Parity";
   let description = [{
     The `nvvm.mbarrier.try_wait.parity` operation performs a potentially-blocking 
-    test for the completion of a specific phase of an mbarrier object using phase 
+    test for the completion of a specific phase of an *mbarrier object* using phase 
     parity. It uses the default `.acquire.cta` semantics.
 
     This operation waits for the completion of the mbarrier phase indicated by the 
@@ -727,7 +727,7 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
     completed rather than potentially timing out.
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
     - `phase`: An integer specifying the phase parity (0 or 1). Even phases 
       have parity 0, odd phases have parity 1.
@@ -764,7 +764,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
   Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
   let summary = "Shared MBarrier Potentially-Blocking Try Wait with Phase Parity";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.try_wait.parity` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
@@ -792,7 +792,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
   let summary = "MBarrier Non-Blocking Test Wait Operation";
   let description = [{
     The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the 
-    completion of a specific phase of an mbarrier object. It uses the default
+    completion of a specific phase of an *mbarrier object*. It uses the default
     `.acquire.cta` semantics.
 
     This operation tests whether the mbarrier phase specified by the state operand 
@@ -800,10 +800,10 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
     completion status without suspending the executing thread.
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the mbarrier object. Uses generic 
+    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
       addressing, but the address must still be in the shared memory space.
     - `state`: An opaque value returned by a previous `mbarrier.arrive` 
-      operation on the same mbarrier object during the current or immediately 
+      operation on the same *mbarrier object* during the current or immediately 
       preceding phase.
 
     The operation returns a boolean value indicating whether the specified phase 
@@ -824,7 +824,7 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
   Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> {
   let summary = "Shared MBarrier Non-Blocking Test Wait Operation";
   let description = [{
-    This Op is the same as the generic memory variant except that the mbarrier object
+    This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object*
     should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
@@ -1400,9 +1400,9 @@ def NVVM_CpAsyncWaitGroupOp : NVVM_Op<"cp.async.wait.group">,
 def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
   let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive";
   let description = [{
-    The `cp.async.mbarrier.arrive` Op makes the mbarrier object track
+    The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
     all prior cp.async operations initiated by the executing thread.
-    The `addr` operand specifies the address of the mbarrier object
+    The `addr` operand specifies the address of the *mbarrier object*
     in generic address space. The `noinc` attr impacts how the
     mbarrier's state is updated.
     
@@ -1425,9 +1425,9 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
 def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> {
   let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared";
   let description = [{
-    The `cp.async.mbarrier.arrive.shared` Op makes the mbarrier object
+    The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
     track all prior cp.async operations initiated by the executing thread.
-    The `addr` operand specifies the address of the mbarrier object in
+    The `addr` operand specifies the address of the *mbarrier object* in
     shared memory. The `noinc` attr impacts how the mbarrier's state
     is updated. 
     
@@ -3759,10 +3759,10 @@ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait", [NVVMRequiresSMa<[100, 101]>]>
 def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit", [NVVMRequiresSMa<[100, 101]>]> {
   let summary = "Tcgen05 commit operations";
   let description = [{
-    The `tcgen05.commit` makes the mbarrier object, specified by
+    The `tcgen05.commit` makes the *mbarrier object*, specified by
     the operand `addr`, track the completion of all the prior
     async-tcgen05 operations initiated by the executing thread.
-    The multicast variants allow signaling on the mbarrier objects
+    The multicast variants allow signaling on the *mbarrier objects*
     of multiple CTAs within the cluster. Operand `multicastMask`,
     when present, specifies the destination CTAs in the cluster such
     that each bit position in the 16-bit `multicastMask` operand

From f2e93e156ea5713d7c808ed9b5e26cec587c65e7 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 17:17:02 +0000
Subject: [PATCH 3/8] Fixed argument and return types to be more specific types
 for mbarrier / barrier docs

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 23 ++++++++++-----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index c824d05060ac8..1f7343ae0706a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -585,7 +585,7 @@ def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
 }
 
 def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
-  Results<(outs LLVM_Type:$res)>,
+  Results<(outs I64:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr)> {
   let summary = "MBarrier Arrive Operation";
   let description = [{
@@ -593,7 +593,7 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
     *mbarrier object* at the specified address. Uses the default `.release.cta` semantics.
 
     This operation causes the executing thread to signal its arrival at the barrier.
-    The operation returns an opaque 64-bit value that captures the phase of the 
+    The operation returns an opaque value that captures the phase of the 
     *mbarrier object* prior to the arrive-on operation. The contents of this state 
     value are implementation-specific.
 
@@ -610,7 +610,7 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
 }
 
 def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
-  Results<(outs LLVM_Type:$res)>,
+  Results<(outs I64:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr)> {
   let summary = "Shared MBarrier Arrive Operation";
   let description = [{
@@ -626,7 +626,7 @@ def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
 }
 
 def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
-  Results<(outs LLVM_Type:$res)>,
+  Results<(outs I64:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> {
   let summary = "MBarrier Arrive No-Complete Operation";
   let description = [{
@@ -636,7 +636,7 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
 
     This operation causes the executing thread to signal its arrival at the barrier 
     with a specified count, but ensures that the barrier phase will not complete as 
-    a result of this operation. The operation returns an opaque 64-bit value that 
+    a result of this operation. The operation returns an opaque value that 
     captures the phase of the *mbarrier object* prior to the arrive-on operation.
 
     The operation takes the following operands:
@@ -654,7 +654,7 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
 }
 
 def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">,
-  Results<(outs LLVM_Type:$res)>,
+  Results<(outs I64:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> {
   let summary = "Shared MBarrier Arrive No-Complete Operation";
   let description = [{
@@ -844,7 +844,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
   let description = [{
     The `nvvm.barrier0` operation is a convenience operation that performs barrier 
     synchronization and communication within a CTA (Cooperative Thread Array) using 
-    barrier ID 0. It is functionally equivalent to `nvvm.barrier` with `barrierId = 0`. 
+    barrier ID 0. It is functionally equivalent to `nvvm.barrier` or `nvvm.barrier id=0`. 
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier)
   }];
@@ -1274,7 +1274,7 @@ def NVVM_VoteSyncOp
 
 def NVVM_SyncWarpOp :
   NVVM_Op<"bar.warp.sync">,
-  Arguments<(ins LLVM_Type:$mask)> {
+  Arguments<(ins I32:$mask)> {
   let summary = "Warp Barrier Synchronization Op";
   let description = [{
     The `nvvm.bar.warp.sync` operation performs barrier synchronization for threads 
@@ -1284,10 +1284,9 @@ def NVVM_SyncWarpOp :
     to the `mask` operand have executed a `bar.warp.sync` with the same mask value 
     before resuming execution.
 
-    The `mask` operand specifies a 32-bit integer mask indicating threads participating 
-    in the barrier, where each bit position corresponds to the thread's lane ID within 
-    the warp. Only threads with their corresponding bit set in the mask participate 
-    in the barrier synchronization.
+    The `mask` operand specifies the threads participating in the barrier, where each 
+    bit position corresponds to the thread's lane ID within the warp. Only threads with 
+    their corresponding bit set in the mask participate in the barrier synchronization.
 
     **Important constraints**:
     - The behavior is undefined if the executing thread is not included in the mask 

From e35f494503f52a23eca2a388a70acd3afd222140 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 18:07:09 +0000
Subject: [PATCH 4/8] Explained .acquire.cta semantics in instruction docs,
 clarified ordering guarantees for try_wait and test.wait

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 48 ++++++++++++++++++++-
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 1f7343ae0706a..adb39a746f4e6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -718,7 +718,12 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
   let description = [{
     The `nvvm.mbarrier.try_wait.parity` operation performs a potentially-blocking 
     test for the completion of a specific phase of an *mbarrier object* using phase 
-    parity. It uses the default `.acquire.cta` semantics.
+    parity. It uses the default `.acquire.cta` semantics. This acquire pattern 
+    establishes memory ordering for operations occurring in program order after this 
+    wait instruction by making operations from other threads visible to subsequent 
+    operations in the current thread. When this wait completes, it synchronizes with 
+    the corresponding release pattern from the `mbarrier.arrive` operation, establishing 
+    memory ordering within the CTA.
 
     This operation waits for the completion of the mbarrier phase indicated by the 
     phase parity. While it uses the underlying PTX `mbarrier.try_wait.parity` 
@@ -734,6 +739,23 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
     - `ticks`: An unsigned integer specifying the suspend time hint in 
       nanoseconds. This may be used instead of the system-dependent time limit.
 
+    **Memory ordering guarantees**: When this wait returns true, the following 
+    ordering guarantees hold:
+
+    1. All memory accesses (except async operations) requested prior to
+       `mbarrier.arrive` having release semantics by participating CTA threads
+       are visible to the executing thread.
+    2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive`
+       by participating CTA threads are visible to the executing thread.
+    3. All `cp.async.bulk` operations using the same mbarrier object requested
+       prior to `mbarrier.arrive` having release semantics by participating CTA
+       threads are visible to the executing thread.
+    4. Memory accesses requested after this wait are not visible to memory
+       accesses performed prior to `mbarrier.arrive` by other participating
+       threads.
+    5. No ordering guarantee exists for memory accesses by the same thread
+       between `mbarrier.arrive` and this wait.
+
     **Implementation behavior**:
     This operation generates a PTX loop that repeatedly calls the underlying 
     `mbarrier.try_wait.parity` instruction until the barrier phase completes. 
@@ -793,7 +815,12 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
   let description = [{
     The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the 
     completion of a specific phase of an *mbarrier object*. It uses the default
-    `.acquire.cta` semantics.
+    `.acquire.cta` semantics. This acquire pattern establishes memory ordering for 
+    operations occurring in program order after this wait instruction by making 
+    operations from other threads visible to subsequent operations in the current 
+    thread. When this wait completes, it synchronizes with the corresponding release 
+    pattern from the `mbarrier.arrive` operation, establishing memory ordering within 
+    the CTA.
 
     This operation tests whether the mbarrier phase specified by the state operand 
     has completed. It is a non-blocking instruction that immediately returns the 
@@ -811,6 +838,23 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
     - `true`: The immediately preceding phase has completed
     - `false`: The phase is still incomplete (current phase)
 
+    **Memory ordering guarantees**: When this wait returns true, the following 
+    ordering guarantees hold:
+
+    1. All memory accesses (except async operations) requested prior to
+       `mbarrier.arrive` having release semantics by participating CTA threads
+       are visible to the executing thread.
+    2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive`
+       by participating CTA threads are visible to the executing thread.
+    3. All `cp.async.bulk` operations using the same mbarrier object requested
+       prior to `mbarrier.arrive` having release semantics by participating CTA
+       threads are visible to the executing thread.
+    4. Memory accesses requested after this wait are not visible to memory
+       accesses performed prior to `mbarrier.arrive` by other participating
+       threads.
+    5. No ordering guarantee exists for memory accesses by the same thread
+       between `mbarrier.arrive` and this wait.
+
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
   }];
   string llvmBuilder = [{

From 96326aff11282f54fb06ccf135fbf3fcb2e71c93 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 18:15:21 +0000
Subject: [PATCH 5/8] Clarified meaning of aligned barrier

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index adb39a746f4e6..e05947e51cc25 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -890,7 +890,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
     synchronization and communication within a CTA (Cooperative Thread Array) using 
     barrier ID 0. It is functionally equivalent to `nvvm.barrier` or `nvvm.barrier id=0`. 
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];
 
   let assemblyFormat = "attr-dict";
@@ -924,10 +924,11 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
     When a barrier completes, the waiting threads are restarted without delay, and 
     the barrier is reinitialized so that it can be immediately reused.
 
-    This operation generates the `.aligned` version of the PTX barrier instruction, 
-    indicating that all threads in the CTA execute the same barrier instruction.
+    This operation generates an aligned barrier, indicating that all threads in the CTA 
+    will execute the same barrier instruction. Behavior is undefined if not all threads 
+    in the CTA reach this instruction.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];
 
   let arguments = (ins     

From 4b3318467706ac88f68e5d1805b49199c24c45db Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 18:31:40 +0000
Subject: [PATCH 6/8] Explained .release.cta sem and expect-tx

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 28 ++++++++++++++++-----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index e05947e51cc25..8c4de1d87fab3 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -590,7 +590,12 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
   let summary = "MBarrier Arrive Operation";
   let description = [{
     The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the 
-    *mbarrier object* at the specified address. Uses the default `.release.cta` semantics.
+    *mbarrier object* at the specified address. Uses the default `.release.cta` semantics. 
+    This release pattern establishes memory ordering for operations occurring in program 
+    order before this arrive instruction by making operations from the current thread 
+    visible to subsequent operations in other threads within the CTA. When other threads 
+    perform corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize 
+    with this release pattern.
 
     This operation causes the executing thread to signal its arrival at the barrier.
     The operation returns an opaque value that captures the phase of the 
@@ -632,7 +637,12 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
   let description = [{
     The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation 
     on the *mbarrier object* with the guarantee that it will not cause the barrier to 
-    complete its current phase. Uses the default `.release.cta` semantics.
+    complete its current phase. Uses the default `.release.cta` semantics. This release 
+    pattern establishes memory ordering for operations occurring in program order before 
+    this arrive instruction by making operations from the current thread visible to 
+    subsequent operations in other threads within the CTA. When other threads perform 
+    corresponding acquire operations (like 'mbarrier.test.wait'), they synchronize with 
+    this release pattern.
 
     This operation causes the executing thread to signal its arrival at the barrier 
     with a specified count, but ensures that the barrier phase will not complete as 
@@ -675,11 +685,17 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
   let description = [{
     The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation 
     followed by an arrive-on operation on the *mbarrier object*. Uses the default 
-    `.release.cta` semantics.
+    `.release.cta` semantics. This release pattern establishes memory ordering for 
+    operations occurring in program order before this arrive instruction by making 
+    operations from the current thread visible to subsequent operations in other 
+    threads within the CTA. When other threads perform corresponding acquire operations 
+    (like 'mbarrier.test.wait'), they synchronize with this release pattern.
 
     This operation first performs an expect-tx operation with the specified transaction 
     count, then performs an arrive-on operation with an implicit count of 1. The 
-    expect-tx operation updates the expected transaction count for the barrier.
+    expect-tx operation increases the tx-count of the *mbarrier object* by the specified 
+    expectCount value, setting the current phase to expect and tracks the completion 
+    of additional asynchronous transactions.
 
     The operation takes the following operands:
     - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
@@ -747,7 +763,7 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
        are visible to the executing thread.
     2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive`
        by participating CTA threads are visible to the executing thread.
-    3. All `cp.async.bulk` operations using the same mbarrier object requested
+    3. All `cp.async.bulk` operations using the same *mbarrier object* requested
        prior to `mbarrier.arrive` having release semantics by participating CTA
        threads are visible to the executing thread.
     4. Memory accesses requested after this wait are not visible to memory
@@ -846,7 +862,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
        are visible to the executing thread.
     2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive`
        by participating CTA threads are visible to the executing thread.
-    3. All `cp.async.bulk` operations using the same mbarrier object requested
+    3. All `cp.async.bulk` operations using the same *mbarrier object* requested
        prior to `mbarrier.arrive` having release semantics by participating CTA
        threads are visible to the executing thread.
     4. Memory accesses requested after this wait are not visible to memory

From e9522e84269d7d2ae1ed248e7ac07fad60a22fc7 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Thu, 4 Sep 2025 20:12:59 +0000
Subject: [PATCH 7/8] Made wording changes for barrier and acquire.cta sem

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8c4de1d87fab3..016f6087f9c28 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -736,7 +736,7 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
     test for the completion of a specific phase of an *mbarrier object* using phase 
     parity. It uses the default `.acquire.cta` semantics. This acquire pattern 
     establishes memory ordering for operations occurring in program order after this 
-    wait instruction by making operations from other threads visible to subsequent 
+    wait instruction by making operations from other threads in the CTA visible to subsequent 
     operations in the current thread. When this wait completes, it synchronizes with 
     the corresponding release pattern from the `mbarrier.arrive` operation, establishing 
     memory ordering within the CTA.
@@ -833,7 +833,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
     completion of a specific phase of an *mbarrier object*. It uses the default
     `.acquire.cta` semantics. This acquire pattern establishes memory ordering for 
     operations occurring in program order after this wait instruction by making 
-    operations from other threads visible to subsequent operations in the current 
+    operations from other threads in the CTA visible to subsequent operations in the current 
     thread. When this wait completes, it synchronizes with the corresponding release 
     pattern from the `mbarrier.arrive` operation, establishing memory ordering within 
     the CTA.
@@ -941,8 +941,8 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
     the barrier is reinitialized so that it can be immediately reused.
 
     This operation generates an aligned barrier, indicating that all threads in the CTA 
-    will execute the same barrier instruction. Behavior is undefined if not all threads 
-    in the CTA reach this instruction.
+    will execute the same barrier instruction. Behavior is undefined if all threads in the 
+    CTA do not reach this instruction.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];

From 25c9ee05093e5d1be77c4e64085717fb729cdd79 Mon Sep 17 00:00:00 2001
From: Stefan Mada <smada@nvidia.com>
Date: Fri, 5 Sep 2025 16:22:28 +0000
Subject: [PATCH 8/8] Reverted type changes to move to another PR

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 016f6087f9c28..50b493c7592fe 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -585,7 +585,7 @@ def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
 }
 
 def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
-  Results<(outs I64:$res)>,
+  Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr)> {
   let summary = "MBarrier Arrive Operation";
   let description = [{
@@ -615,7 +615,7 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
 }
 
 def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
-  Results<(outs I64:$res)>,
+  Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr)> {
   let summary = "Shared MBarrier Arrive Operation";
   let description = [{
@@ -631,7 +631,7 @@ def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
 }
 
 def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
-  Results<(outs I64:$res)>,
+  Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> {
   let summary = "MBarrier Arrive No-Complete Operation";
   let description = [{
@@ -664,7 +664,7 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
 }
 
 def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">,
-  Results<(outs I64:$res)>,
+  Results<(outs LLVM_Type:$res)>,
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> {
   let summary = "Shared MBarrier Arrive No-Complete Operation";
   let description = [{
@@ -1335,7 +1335,7 @@ def NVVM_VoteSyncOp
 
 def NVVM_SyncWarpOp :
   NVVM_Op<"bar.warp.sync">,
-  Arguments<(ins I32:$mask)> {
+  Arguments<(ins LLVM_Type:$mask)> {
   let summary = "Warp Barrier Synchronization Op";
   let description = [{
     The `nvvm.bar.warp.sync` operation performs barrier synchronization for threads