diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index 4c0d266428632..18b56d384b479 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, // THREADFENCE void CUDAIntrinsicLibrary::genThreadFence( llvm::ArrayRef args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); + assert(args.size() == 0); + mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU); } // THREADFENCE_BLOCK void CUDAIntrinsicLibrary::genThreadFenceBlock( llvm::ArrayRef args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); + assert(args.size() == 0); + mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA); } // THREADFENCE_SYSTEM void CUDAIntrinsicLibrary::genThreadFenceSystem( llvm::ArrayRef args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); + assert(args.size() == 0); + mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS); } // TMA_BULK_COMMIT_GROUP diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 
9f8f74a0c7b5e..3a255afd59263 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -22,9 +22,6 @@ attributes(global) subroutine devsub() call syncthreads() call syncwarp(1) - call threadfence() - call threadfence_block() - call threadfence_system() ret = syncthreads_and(1) res = syncthreads_and(tid > offset) ret = syncthreads_count(1) @@ -106,9 +103,6 @@ end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} ! CHECK: nvvm.barrier0 ! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 -! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath : () -> () -! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath : () -> () -! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath : () -> () ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath : (i32) -> i32 ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref diff --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf new file mode 100644 index 0000000000000..6e2e23423c360 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf @@ -0,0 +1,14 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran intrinsic lowerings for synchronization. + +attributes(global) subroutine sync() + call threadfence() + call threadfence_block() + call threadfence_system() +end subroutine + +! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: nvvm.memory.barrier +! CHECK: nvvm.memory.barrier +! CHECK: nvvm.memory.barrier