Skip to content

Commit 9e53ef3

Browse files
authored
[MLIR][NVVM] Update mbarrier.arrive.* Op (#168758)
This patch updates the mbarrier.arrive.* family of Ops to include all features added up to Blackwell.

* Update the `mbarrier.arrive` Op to include the shared_cluster memory space, cta/cluster scope and an option to lower using relaxed semantics.
* An `arrive_drop` variant is added for both the `arrive` and `arrive.nocomplete` operations.
* Updates for the `expect_tx` and `complete_tx` operations.
* Verifier checks are added wherever appropriate.
* lit tests are added to verify the lowering to the intrinsics.

TODO:
* Updates for the remaining mbarrier family will be done in subsequent PRs (mainly `arrive.expect_tx`, `test_wait` and `try_wait`).

Signed-off-by: Durgadoss R <[email protected]>
1 parent 4bc654d commit 9e53ef3

File tree

9 files changed

+676
-58
lines changed

9 files changed

+676
-58
lines changed

flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -906,7 +906,7 @@ CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
906906
mlir::Value barrier = convertPtrToNVVMSpace(
907907
builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
908908
return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier)
909-
.getResult();
909+
.getResult(0);
910910
}
911911

912912
// BARRIER_ARRIVE_CNT

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 176 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -638,9 +638,76 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
638638
}];
639639
}
640640

641-
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
642-
Results<(outs I64:$res)>,
643-
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
641+
def NVVM_MBarrierExpectTxOp : NVVM_Op<"mbarrier.expect_tx"> {
642+
let summary = "MBarrier expect-tx Operation";
643+
let description = [{
644+
The `nvvm.mbarrier.expect_tx` operation increases the transaction count
645+
of the mbarrier located at `addr` by `txcount` amount. The `scope`
646+
specifies the set of threads that can directly observe the memory
647+
synchronizing effect of the `mbarrier.expect_tx` operation. `CTA`
648+
and `CLUSTER` are the only allowed values for `scope`.
649+
650+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx)
651+
}];
652+
653+
let arguments = (ins
654+
AnyTypeOf<[LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
655+
I32:$txcount,
656+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope);
657+
658+
let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands)";
659+
660+
let hasVerifier = 1;
661+
662+
let extraClassDeclaration = [{
663+
static mlir::NVVM::IDArgPair
664+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
665+
llvm::IRBuilderBase& builder);
666+
}];
667+
668+
string llvmBuilder = [{
669+
auto [id, args] = NVVM::MBarrierExpectTxOp::getIntrinsicIDAndArgs(
670+
*op, moduleTranslation, builder);
671+
createIntrinsicCall(builder, id, args);
672+
}];
673+
}
674+
675+
def NVVM_MBarrierCompleteTxOp : NVVM_Op<"mbarrier.complete_tx"> {
676+
let summary = "MBarrier complete-tx Operation";
677+
let description = [{
678+
The `nvvm.mbarrier.complete_tx` operation decrements the transaction
679+
count of the *mbarrier object* at `addr` by `txcount`. It also signals
680+
the completion of asynchronous transactions that were tracked by the
681+
current phase. The `scope` specifies the set of threads that can directly
682+
observe the memory synchronizing effect of the `mbarrier.complete_tx`
683+
operation. `CTA` and `CLUSTER` are the only allowed values for `scope`.
684+
685+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx)
686+
}];
687+
688+
let arguments = (ins
689+
AnyTypeOf<[LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
690+
I32:$txcount,
691+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope);
692+
693+
let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands)";
694+
695+
let hasVerifier = 1;
696+
697+
let extraClassDeclaration = [{
698+
static mlir::NVVM::IDArgPair
699+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
700+
llvm::IRBuilderBase& builder);
701+
}];
702+
703+
string llvmBuilder = [{
704+
auto [id, args] = NVVM::MBarrierCompleteTxOp::getIntrinsicIDAndArgs(
705+
*op, moduleTranslation, builder);
706+
createIntrinsicCall(builder, id, args);
707+
}];
708+
}
709+
710+
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive"> {
644711
let summary = "MBarrier Arrive Operation";
645712
let description = [{
646713
The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the
@@ -652,19 +719,40 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
652719
with this release pattern.
653720

654721
This operation causes the executing thread to signal its arrival at the barrier.
655-
The operation returns an opaque value that captures the phase of the
656-
*mbarrier object* prior to the arrive-on operation. The contents of this state
657-
value are implementation-specific.
658722

659-
The operation takes the following operand:
723+
- `res`: When the `space` is not shared_cluster, this operation returns an
724+
opaque 64-bit value capturing the phase of the *mbarrier object* prior to
725+
the arrive-on operation. The contents of this return value are
726+
implementation-specific. An *mbarrier object* located in the shared_cluster
727+
space cannot return a value.
728+
729+
The operation takes the following operands:
660730
- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
661-
must be a pointer to generic or shared::cta memory. When it is generic, the
662-
underlying address must be within the shared::cta memory space; otherwise
663-
the behavior is undefined.
731+
must be a pointer to generic or shared_cta or shared_cluster memory. When it
732+
is generic, the underlying address must be within the shared_cta memory space;
733+
otherwise the behavior is undefined.
734+
- `count`: This specifies the amount by which the pending arrival count is
735+
decremented. If the `count` argument is not specified, the pending arrival
736+
count is decremented by 1.
737+
- `scope`: This specifies the set of threads that directly observe the memory
738+
synchronizing effect of the `mbarrier.arrive` operation.
739+
- `space`: This indicates the memory space where the mbarrier object resides.
740+
- `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
741+
and does not provide any ordering or visibility guarantees.
664742

665743
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
666744
}];
667-
let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)";
745+
746+
let results = (outs Optional<I64>:$res);
747+
let arguments = (ins
748+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
749+
Optional<I32>:$count,
750+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
751+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
752+
753+
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
754+
755+
let hasVerifier = 1;
668756

669757
let extraClassDeclaration = [{
670758
static mlir::NVVM::IDArgPair
@@ -675,7 +763,54 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
675763
string llvmBuilder = [{
676764
auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs(
677765
*op, moduleTranslation, builder);
678-
$res = createIntrinsicCall(builder, id, args);
766+
767+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
768+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
769+
$res = createIntrinsicCall(builder, id, args);
770+
else
771+
createIntrinsicCall(builder, id, args);
772+
}];
773+
}
774+
775+
def NVVM_MBarrierArriveDropOp : NVVM_Op<"mbarrier.arrive_drop"> {
776+
let summary = "MBarrier Arrive-Drop Operation";
777+
let description = [{
778+
The `nvvm.mbarrier.arrive_drop` operation decrements the expected arrival
779+
count of the *mbarrier object* by `count` and then performs an arrive-on
780+
operation. When `count` is not specified, it defaults to 1. The decrement
781+
of the expected arrival count applies to all the subsequent phases of the
782+
*mbarrier object*. The remaining semantics are identical to those of the
783+
`nvvm.mbarrier.arrive` operation.
784+
785+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
786+
}];
787+
788+
let results = (outs Optional<I64>:$res);
789+
let arguments = (ins
790+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
791+
Optional<I32>:$count,
792+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
793+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
794+
795+
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
796+
797+
let hasVerifier = 1;
798+
799+
let extraClassDeclaration = [{
800+
static mlir::NVVM::IDArgPair
801+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
802+
llvm::IRBuilderBase& builder);
803+
}];
804+
805+
string llvmBuilder = [{
806+
auto [id, args] = NVVM::MBarrierArriveDropOp::getIntrinsicIDAndArgs(
807+
*op, moduleTranslation, builder);
808+
809+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
810+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
811+
$res = createIntrinsicCall(builder, id, args);
812+
else
813+
createIntrinsicCall(builder, id, args);
679814
}];
680815
}
681816

@@ -725,6 +860,35 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
725860
}];
726861
}
727862

863+
def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomplete">,
864+
Results<(outs I64:$res)>,
865+
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
866+
I32:$count)> {
867+
let summary = "MBarrier Arrive-Drop No-Complete Operation";
868+
let description = [{
869+
The `nvvm.mbarrier.arrive_drop.nocomplete` operation decrements the expected
870+
arrival count of the *mbarrier object* by the amount `count` and then performs
871+
an arrive-on operation on the *mbarrier object* with the guarantee that it
872+
will not cause the barrier to complete its current phase.
873+
874+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
875+
}];
876+
877+
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
878+
879+
let extraClassDeclaration = [{
880+
static mlir::NVVM::IDArgPair
881+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
882+
llvm::IRBuilderBase& builder);
883+
}];
884+
885+
string llvmBuilder = [{
886+
auto [id, args] = NVVM::MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs(
887+
*op, moduleTranslation, builder);
888+
$res = createIntrinsicCall(builder, id, args);
889+
}];
890+
}
891+
728892
def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
729893
Arguments<(ins
730894
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,

0 commit comments

Comments
 (0)