
Commit 112bbdf

[MLIR][NVVM] Update mbarrier.arrive.expect_tx Op
This patch updates the mbarrier.arrive.expect_tx Op and also adds an Op for its arrive_drop version.

* No change in the existing inline-asm lowering. This functionality continues to work as is.
* An optional return value is added for shared_cta space.
* The scope and semantics are added as attributes.
* Inline-PTX lowering is available when `predicate` is provided. Otherwise, the Op lowers to intrinsics.
* lit tests are added to verify the lowering to intrinsics.
* Specific negative tests are added to check the invalid cases for inline-ptx lowering.

Signed-off-by: Durgadoss R <[email protected]>
1 parent 4769122 commit 112bbdf
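A minimal MLIR sketch of the two lowering paths described above (illustrative SSA names; the exact forms are exercised in the lit tests added by this patch):

  // No predicate: lowers to intrinsics; scope/relaxed attributes and an
  // optional i64 result (shared_cta space) are available.
  %token = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount
             {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64

  // With a predicate: lowers to inline-PTX as before (no result, CTA scope only).
  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1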

File tree

7 files changed: +398, −19 lines


mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 84 additions & 12 deletions
@@ -889,10 +889,7 @@ def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomple
   }];
 }
 
-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
-  Arguments<(ins
-    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
-    I32:$txcount, PtxPredicate:$predicate)> {
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx"> {
   let summary = "MBarrier Arrive with Expected Transaction Count";
   let description = [{
     The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation
@@ -903,23 +900,98 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
     threads within the CTA. When other threads perform corresponding acquire operations
     (like 'mbarrier.test.wait'), they synchronize with this release pattern.
 
-    This operation first performs an expect-tx operation with the specified transaction
-    count, then performs an arrive-on operation with an implicit count of 1. The
-    expect-tx operation increases the tx-count of the *mbarrier object* by the specified
-    expectCount value, setting the current phase to expect and tracks the completion
-    of additional asynchronous transactions.
+    This operation first performs an expect-tx operation with the specified transaction
+    count, then performs an arrive-on operation with an implicit count of 1. The
+    expect-tx operation increases the expect-count of the *mbarrier object* by the
+    specified value (i.e. `txcount`), setting the current phase to expect and track
+    the completion of additional asynchronous transactions.
 
     The operation takes the following operands:
     - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
       addressing, but the address must still be in the shared memory space.
     - `txcount`: An unsigned integer specifying the expected transaction count
       for the expect-tx operation. This represents the number of asynchronous transactions
       expected to complete before the barrier phase completes.
-    - `predicate`: Optional predicate for conditional execution.
+    - `scope`: This specifies the set of threads that directly observe the memory
+      synchronizing effect of the `mbarrier.test.wait` operation.
+    - `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
+      and does not provide any ordering or visibility guarantees.
+    - `predicate`: Optional predicate for conditional execution used only when lowering to
+      inline-ptx.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
+  }];
+
+  let results = (outs Optional<I64>:$res);
+  let arguments = (ins
+    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
+    I32:$txcount,
+    DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
+    DefaultValuedAttr<BoolAttr, "false">:$relaxed,
+    PtxPredicate:$predicate);
+
+  let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands) (`->` type($res)^)?";
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool hasIntrinsic() { return !getPredicate(); }
+
+    bool getAsmValues(RewriterBase &rewriter,
+        llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues);
+
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase& builder);
+  }];
+
+  string llvmBuilder = [{
+    auto [id, args] = NVVM::MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+
+    int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
+    if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
+      $res = createIntrinsicCall(builder, id, args);
+    else
+      createIntrinsicCall(builder, id, args);
+  }];
+}
+
+def NVVM_MBarrierArriveDropExpectTxOp : NVVM_Op<"mbarrier.arrive_drop.expect_tx"> {
+  let summary = "MBarrier arrive_drop with expected transaction count";
+  let description = [{
+    The `nvvm.mbarrier.arrive_drop.expect_tx` operation is similar to the
+    `nvvm.mbarrier.arrive.expect_tx` operation except that it performs an
+    `arrive_drop` operation instead of only an `arrive` operation.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
+  }];
+
+  let results = (outs Optional<I64>:$res);
+  let arguments = (ins
+    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
+    I32:$txcount,
+    DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
+    DefaultValuedAttr<BoolAttr, "false">:$relaxed);
+
+  let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands) (`->` type($res)^)?";
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase& builder);
+  }];
+
+  string llvmBuilder = [{
+    auto [id, args] = NVVM::MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+
+    int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
+    if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
+      $res = createIntrinsicCall(builder, id, args);
+    else
+      createIntrinsicCall(builder, id, args);
   }];
-  let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
 }
 
 def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
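For illustration, the assemblyFormat above admits forms like the following for the new arrive_drop Op (a sketch with made-up SSA names; concrete uses appear in the added lit tests):

  // Optional i64 result for a shared (shared_cta) mbarrier pointer.
  %res = nvvm.mbarrier.arrive_drop.expect_tx %mbar, %tx
           {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64

  // No result for a shared_cluster pointer (the intrinsic returns void there).
  nvvm.mbarrier.arrive_drop.expect_tx %mbar, %tx : !llvm.ptr<7>, i32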

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Lines changed: 5 additions & 1 deletion
@@ -923,7 +923,11 @@ struct NVGPUMBarrierArriveExpectTxLowering
         adaptor.getMbarId(), rewriter);
     Value txcount = truncToI32(b, adaptor.getTxcount());
     rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
-        op, barrier, txcount, adaptor.getPredicate());
+        op, Type{},              // return-value is optional and is void by default
+        barrier, txcount,        // barrier and txcount
+        NVVM::MemScopeKind::CTA, // default scope is CTA
+        false,                   // relaxed-semantics is false
+        adaptor.getPredicate());
     return success();
   }
 };

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Lines changed: 109 additions & 0 deletions
@@ -274,6 +274,34 @@ LogicalResult MBarrierArriveDropOp::verify() {
                                     getRes());
 }
 
+LogicalResult MBarrierArriveExpectTxOp::verify() {
+  // The inline-ptx version of this Op does not support all features.
+  // With predicate, this Op lowers to inline-ptx. So, verify and
+  // error-out if there are unsupported features.
+  if (getPredicate()) {
+    if (getScope() != NVVM::MemScopeKind::CTA)
+      return emitError("mbarrier scope must be CTA when using predicate");
+
+    if (isPtrInSharedClusterSpace(getAddr()))
+      return emitError("mbarrier in shared_cluster space is not supported when "
+                       "using predicate");
+
+    if (getRes())
+      return emitError("return-value is not supported when using predicate");
+
+    if (getRelaxed() == true)
+      return emitError("mbarrier with relaxed semantics is not supported when "
+                       "using predicate");
+  }
+  return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
+                                    getRes());
+}
+
+LogicalResult MBarrierArriveDropExpectTxOp::verify() {
+  return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
+                                    getRes());
+}
+
 LogicalResult MBarrierExpectTxOp::verify() {
   return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope());
 }
@@ -2576,6 +2604,87 @@ mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
   return {id, {mbar, count}};
 }
 
+bool MBarrierArriveExpectTxOp::getAsmValues(
+    RewriterBase &rewriter,
+    llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
+        &asmValues) {
+  // Add all the operands but not the attrs to the asmValues list.
+  // The attrs here are used to generate the right variants for
+  // intrinsics-lowering. So, we ignore them while generating inline-PTX.
+  for (auto val : getOperands())
+    asmValues.push_back({val, mlir::NVVM::PTXRegisterMod::Read});
+
+  return false;
+}
+
+mlir::NVVM::IDArgPair MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierArriveExpectTxOp>(op);
+
+  bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
+  bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
+  // bit-0: Space
+  // bit-1: Scope
+  size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
+
+  // clang-format off
+  static constexpr llvm::Intrinsic::ID IDs[] = {
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cluster,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cluster};
+  static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cluster,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cluster};
+  // clang-format on
+  auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
+
+  // Tidy-up the Intrinsic Args
+  llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
+  llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
+  bool needCast = isPtrInGenericSpace(thisOp.getAddr());
+  if (needCast)
+    mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
+
+  return {id, {mbar, txcount}};
+}
+
+mlir::NVVM::IDArgPair MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierArriveDropExpectTxOp>(op);
+
+  bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
+  bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
+  // bit-0: Space
+  // bit-1: Scope
+  size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
+
+  // clang-format off
+  static constexpr llvm::Intrinsic::ID IDs[] = {
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cluster,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cluster};
+  static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cluster,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cta,
+      llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cluster};
+  // clang-format on
+  auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
+
+  // Tidy-up the Intrinsic Args
+  llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
+  llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
+  bool needCast = isPtrInGenericSpace(thisOp.getAddr());
+  if (needCast)
+    mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
+
+  return {id, {mbar, txcount}};
+}
+
 mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
     Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
   auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op);
mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir

Lines changed: 2 additions & 6 deletions
@@ -16,17 +16,13 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx
 llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
   nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
   llvm.return
 }
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
 llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r"
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
   // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
   nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1
   llvm.return
@@ -544,8 +540,8 @@ func.func @elect_one_leader_sync() {
 
 // -----
 
-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
-llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
+// CHECK-LABEL: @test_nvvm_prefetch
+llvm.func @test_nvvm_prefetch(%desc : !llvm.ptr, %pred : i1) {
   //CHECK: nvvm.prefetch tensormap, %{{.*}}
   nvvm.prefetch tensormap, %desc : !llvm.ptr
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
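The negative tests mentioned in the commit message are not shown in this view; a hypothetical sketch of one such invalid case, reusing a verifier diagnostic added in NVVMDialect.cpp (the actual test file and RUN line may differ):

  // expected-error @below {{mbarrier scope must be CTA when using predicate}}
  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred
    {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32, i1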
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @mbarrier_arrive_drop_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_generic(ptr %0, i32 %1) {
+  // CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
+  // CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
+  // CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
+  // CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
+  // CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
+  // CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
+  %2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
+  %4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
+  %5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_drop_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
+  %2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_drop_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32
+
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
+  llvm.return
+}
