Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 6 additions & 18 deletions flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
// THREADFENCE
/// Lower the CUDA Fortran `threadfence()` intrinsic subroutine.
///
/// Emits exactly one `mlir::NVVM::MembarOp` with `MemScopeKind::GPU` at the
/// builder's current insertion point. The fence is expressed directly as an
/// NVVM dialect operation rather than as a `fir.call` to a manually declared
/// `llvm.nvvm.membar.gl` function, so only a single barrier is generated.
///
/// \param args Intrinsic arguments; `threadfence` takes none, so this must be
///             empty.
void CUDAIntrinsicLibrary::genThreadFence(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  // Validate the call shape before creating any IR.
  assert(args.empty() && "threadfence takes no arguments");
  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
}

// THREADFENCE_BLOCK
/// Lower the CUDA Fortran `threadfence_block()` intrinsic subroutine.
///
/// Emits exactly one `mlir::NVVM::MembarOp` with `MemScopeKind::CTA` at the
/// builder's current insertion point. The fence is expressed directly as an
/// NVVM dialect operation rather than as a `fir.call` to a manually declared
/// `llvm.nvvm.membar.cta` function, so only a single barrier is generated.
///
/// \param args Intrinsic arguments; `threadfence_block` takes none, so this
///             must be empty.
void CUDAIntrinsicLibrary::genThreadFenceBlock(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  // Validate the call shape before creating any IR.
  assert(args.empty() && "threadfence_block takes no arguments");
  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
}

// THREADFENCE_SYSTEM
/// Lower the CUDA Fortran `threadfence_system()` intrinsic subroutine.
///
/// Emits exactly one `mlir::NVVM::MembarOp` with `MemScopeKind::SYS` at the
/// builder's current insertion point. The fence is expressed directly as an
/// NVVM dialect operation rather than as a `fir.call` to a manually declared
/// `llvm.nvvm.membar.sys` function, so only a single barrier is generated.
///
/// \param args Intrinsic arguments; `threadfence_system` takes none, so this
///             must be empty.
void CUDAIntrinsicLibrary::genThreadFenceSystem(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  // Validate the call shape before creating any IR.
  assert(args.empty() && "threadfence_system takes no arguments");
  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
}

// TMA_BULK_COMMIT_GROUP
Expand Down
6 changes: 0 additions & 6 deletions flang/test/Lower/CUDA/cuda-device-proc.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ attributes(global) subroutine devsub()

call syncthreads()
call syncwarp(1)
call threadfence()
call threadfence_block()
call threadfence_system()
ret = syncthreads_and(1)
res = syncthreads_and(tid > offset)
ret = syncthreads_count(1)
Expand Down Expand Up @@ -106,9 +103,6 @@ end
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: nvvm.barrier0
! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
Expand Down
14 changes: 14 additions & 0 deletions flang/test/Lower/CUDA/cuda-synchronization.cuf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

! Test CUDA Fortran intrinsic lowerings for synchronization.

! Lowering input: a global subroutine that calls each thread-fence intrinsic
! exactly once. The call order is significant: the FileCheck directives after
! this subroutine expect the <gpu>, <cta>, and <sys> barriers in this sequence.
attributes(global) subroutine sync()
call threadfence()
call threadfence_block()
call threadfence_system()
end subroutine

! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: nvvm.memory.barrier <gpu>
! CHECK: nvvm.memory.barrier <cta>
! CHECK: nvvm.memory.barrier <sys>