Skip to content

Commit 57730f6

Browse files
authored
[flang][cuda] Switch to inline ptx for barrier_arrive (#166261)
1 parent a3a99c3 commit 57730f6

File tree

2 files changed: 7 additions and 8 deletions.

2 files changed: 7 additions and 8 deletions.

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3386,13 +3386,12 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
33863386
assert(args.size() == 2);
33873387
mlir::Value barrier = convertPtrToNVVMSpace(
33883388
builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
3389-
mlir::Value token = fir::AllocaOp::create(builder, loc, resultType);
3390-
// TODO: the MBarrierArriveExpectTxOp is not taking the state argument and
3391-
// currently just the sink symbol `_`.
3392-
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
3393-
mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1],
3394-
{});
3395-
return fir::LoadOp::create(builder, loc, token);
3389+
return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType},
3390+
{barrier, args[1]}, {},
3391+
"mbarrier.arrive.expect_tx.release."
3392+
"cta.shared::cta.b64 %0, [%1], %2;",
3393+
{})
3394+
.getResult(0);
33963395
}
33973396

33983397
// BARRIER_INIT (CUDA)

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -444,7 +444,7 @@ end subroutine
444444

445445
! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
446446
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
447-
! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32
447+
! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64
448448

449449

450450
attributes(global) subroutine test_fence()

0 commit comments

Comments
 (0)