diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 9c9c0609f4fc3..e2ea89483ef11 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -185,6 +185,7 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef); + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef); mlir::Value genAsind(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6a343645ab878..63c013dda95e6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -44,6 +44,7 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" @@ -51,7 +52,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include // temporary -- only used in genIeeeGetOrSetModesOrStatus -#include #include #define DEBUG_TYPE "flang-lower-intrinsic" @@ -147,6 +147,10 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, + {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, 
@@ -2574,6 +2578,36 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
   return builder.create(loc, atan, factor);
 }
 
+/// Create an `llvm.atomicrmw` operation that applies \p binOp to the memory
+/// referenced by \p arg0, with \p arg1 as the operand, using sequentially
+/// consistent ordering. \p arg0 is converted to `!llvm.ptr` first. Returns
+/// the value produced by the created operation.
+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+                                mlir::Value arg1) {
+  auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  arg0 = builder.createConvert(loc, llvmPointerType, arg0);
+  return builder.create<mlir::LLVM::AtomicRMWOp>(
+      loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst);
+}
+
+// ATOMICADD
+// Shared lowering for the atomicaddi/atomicaddl/atomicaddf/atomicaddd
+// handlers. `args` must hold exactly two values: the address to update, then
+// the value to add. `resultType` is not consulted here.
+mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  // Integer operands use the integer `add` flavor; every other operand type
+  // is lowered as a floating-point `fadd`.
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::add
+          : mlir::LLVM::AtomicBinOp::fadd;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
 // ASSOCIATED
 fir::ExtendedValue
 IntrinsicLibrary::genAssociated(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 3d487fd000a09..53b6beaaf1ad8 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system()
     end function
   end interface
   public :: __fadd_ru
-
+
+  ! 
Atomic Operations + + interface atomicadd + attributes(device) pure integer function atomicaddi(address, val) + !dir$ ignore_tkr (d) address, (d) val + integer, intent(inout) :: address + integer, value :: val + end function + attributes(device) pure real function atomicaddf(address, val) + !dir$ ignore_tkr (d) address, (d) val + real, intent(inout) :: address + real, value :: val + end function + attributes(device) pure real*8 function atomicaddd(address, val) + !dir$ ignore_tkr (d) address, (d) val + real*8, intent(inout) :: address + real*8, value :: val + end function + attributes(device) pure integer(8) function atomicaddl(address, val) + !dir$ ignore_tkr (d) address, (d) val + integer(8), intent(inout) :: address + integer(8), value :: val + end function + end interface +public :: atomicadd + end module diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 2042bbbe19650..661e5728bf85b 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -5,6 +5,10 @@ attributes(global) subroutine devsub() implicit none integer :: ret + real(4) :: af + real(8) :: ad + integer(4) :: ai + integer(8) :: al call syncthreads() call syncwarp(1) @@ -14,6 +18,11 @@ attributes(global) subroutine devsub() ret = syncthreads_and(1) ret = syncthreads_count(1) ret = syncthreads_or(1) + + ai = atomicadd(ai, 1_4) + al = atomicadd(al, 1_8) + af = atomicadd(af, 1.0_4) + ad = atomicadd(ad, 1.0_8) end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} @@ -25,6 +34,10 @@ end ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath : (i32) -> i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath : (i32) -> i32 +! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 +! 
CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 ! CHECK: func.func private @llvm.nvvm.barrier0() ! CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs} diff --git a/flang/test/Semantics/cuf-device-procedures01.cuf b/flang/test/Semantics/cuf-device-procedures01.cuf index b9918d8a4ae4c..92ee02bb3c64d 100644 --- a/flang/test/Semantics/cuf-device-procedures01.cuf +++ b/flang/test/Semantics/cuf-device-procedures01.cuf @@ -28,8 +28,17 @@ end ! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice subroutine host() + real(4) :: af + real(8) :: ad + integer(4) :: ai + integer(8) :: al call syncthreads() + ai = atomicadd(ai, 1_4) + al = atomicadd(al, 1_8) + af = atomicadd(af, 1.0_4) + ad = atomicadd(ad, 1.0_8) end subroutine ! CHECK-LABEL: Subprogram scope: host +! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}} ! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}}