diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h similarity index 100% rename from flang/include/flang/Optimizer/Transforms/CUFCommon.h rename to flang/include/flang/Optimizer/Builder/CUFCommon.h diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index d06587c57d44b..9a31ffa2e9471 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -140,6 +140,17 @@ def cuf_DeallocateOp : cuf_Op<"deallocate", let hasVerifier = 1; } +def cuf_SyncDescriptorOp : cuf_Op<"sync_descriptor", []> { + let summary = + "Synchronize the host and device descriptor of a Fortran pointer"; + + let arguments = (ins SymbolRefAttr:$globalName); + + let assemblyFormat = [{ + $globalName attr-dict + }]; +} + def cuf_DataTransferOp : cuf_Op<"data_transfer", []> { let summary = "Represent a data transfer between host and device memory"; diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h index 55878aaac57fb..0ee7feca10e44 100644 --- a/flang/include/flang/Runtime/CUDA/descriptor.h +++ b/flang/include/flang/Runtime/CUDA/descriptor.h @@ -33,6 +33,10 @@ void *RTDECL(CUFGetDeviceAddress)( void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, const char *sourceFile = nullptr, int sourceLine = 0); +/// Sync the host descriptor at \p hostPtr to its registered device address.
+void RTDECL(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index fb8380ac7e8c5..4c64870675816 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -22,12 +22,14 @@ #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/StatementContext.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Parser/parse-tree.h" @@ -1086,6 +1088,22 @@ bool Fortran::lower::isArraySectionWithoutVectorSubscript( !Fortran::evaluate::HasVectorSubscript(expr); } +/// Emit a cuf.sync_descriptor for \p box when it is declared from the address +/// of a global accepted by cuf::isRegisteredDeviceGlobal; otherwise do nothing. +static void genCUFPointerSync(const mlir::Value box, + fir::FirOpBuilder &builder) { + auto declareOp = box.getDefiningOp<hlfir::DeclareOp>(); + if (!declareOp) + return; + auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>(); + if (!addrOfOp) + return; + auto mod = addrOfOp->getParentOfType<mlir::ModuleOp>(); + auto globalOp = mod.lookupSymbol<fir::GlobalOp>(addrOfOp.getSymbol()); + if (globalOp && cuf::isRegisteredDeviceGlobal(globalOp)) + builder.create<cuf::SyncDescriptorOp>(box.getLoc(), addrOfOp.getSymbol()); +} + void Fortran::lower::associateMutableBox( Fortran::lower::AbstractConverter &converter, mlir::Location loc, const fir::MutableBoxValue &box, const Fortran::lower::SomeExpr &source, @@ -1098,6 +1116,7 @@ void Fortran::lower::associateMutableBox( if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx); fir::factory::associateMutableBox(builder, loc, 
box, rhs, lbounds); + genCUFPointerSync(box.getAddr(), builder); return; } // The right hand side is not be evaluated into a temp. Array sections can diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 05164d41a4cb5..a824d70fdb5c7 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp HLFIRTools.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp similarity index 97% rename from flang/lib/Optimizer/Transforms/CUFCommon.cpp rename to flang/lib/Optimizer/Builder/CUFCommon.cpp index bbe33217e8f45..81a8a90ce394e 100644 --- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "flang/Optimizer/Transforms/CUFCommon.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 9eafa4ec234bd..d20d3bc4108ce 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -9,7 +9,6 @@ add_flang_library(FIRTransforms CompilerGeneratedNames.cpp ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp - CUFCommon.cpp CUFAddConstructor.cpp CUFDeviceGlobal.cpp CUFOpConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 9591f48c5d417..97551595db039 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -7,6 
+7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" @@ -19,7 +20,6 @@ #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/registration.h" #include "flang/Runtime/entry-names.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 07cc1f3b4b51c..2e6c272fa9089 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/InternalNames.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/allocatable.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index de5c51556eecf..f08f9e412b885 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -8,6 +8,7 @@ #include "flang/Optimizer/Transforms/CUFOpConversion.h" #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include 
"flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" @@ -15,7 +16,6 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/allocatable.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -788,6 +788,48 @@ struct CUFLaunchOpConversion const mlir::SymbolTable &symTab; }; +/// Rewrite cuf.sync_descriptor as a runtime call to CUFSyncGlobalDescriptor on +/// the address of the named global; returns failure if the global is missing. +struct CUFSyncDescriptorOpConversion + : public mlir::OpRewritePattern<cuf::SyncDescriptorOp> { + using OpRewritePattern::OpRewritePattern; + + CUFSyncDescriptorOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable &symTab) + : OpRewritePattern(context), symTab{symTab} {} + + mlir::LogicalResult + matchAndRewrite(cuf::SyncDescriptorOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + auto globalOp = mod.lookupSymbol<fir::GlobalOp>(op.getGlobalName()); + if (!globalOp) + return mlir::failure(); + + auto hostAddr = builder.create<fir::AddrOfOp>( + loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName()); + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc, + builder); + auto fTy = callee.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( + builder, loc, fTy, hostAddr, sourceFile, sourceLine)}; + builder.create<fir::CallOp>(loc, callee, args); + // Erase via the rewriter so the pattern driver tracks the removal. + rewriter.eraseOp(op); + return mlir::success(); + } + +private: + const mlir::SymbolTable &symTab; +}; + class CUFOpConversion : public fir::impl::CUFOpConversionBase { public: void runOnOperation() override { @@ -851,7 +893,8 @@ void cuf::populateCUFToFIRConversionPatterns( 
CUFFreeOpConversion>(patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index d3567f453fceb..fa6a7b23624e8 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -24,6 +24,7 @@ #include "flang/Common/Fortran.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/LowLevelIntrinsics.h" #include "flang/Optimizer/Builder/Todo.h" @@ -31,7 +32,6 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/entry-names.h" diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp index 391c47e84241d..947eeb66aa3d6 100644 --- a/flang/runtime/CUDA/descriptor.cpp +++ b/flang/runtime/CUDA/descriptor.cpp @@ -46,6 +46,16 @@ void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice)); } +/// Copy the host descriptor at \p hostPtr to the device address that +/// CUFGetDeviceAddress returns for \p hostPtr. +void RTDEF(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile, int sourceLine) { + void *devAddr{RTNAME(CUFGetDeviceAddress)(hostPtr, sourceFile, sourceLine)}; + RTNAME(CUFDescriptorSync) + (static_cast<Descriptor *>(devAddr), + static_cast<const Descriptor *>(hostPtr), sourceFile, sourceLine); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime::cuda diff --git a/flang/test/Fir/CUDA/cuda-sync-desc.mlir 
b/flang/test/Fir/CUDA/cuda-sync-desc.mlir new file mode 100644 index 0000000000000..20b317f34a7f2 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-sync-desc.mlir @@ -0,0 +1,20 @@ +// RUN: fir-opt --cuf-convert %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git f37e52237791f58438790c77edeb8de08f692987)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } + func.func @_QQmain() { + cuf.sync_descriptor @_QMdevptrEdev_ptr + return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: %[[HOST_ADDR:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> +// CHECK: %[[HOST_ADDR_PTR:.*]] = fir.convert %[[HOST_ADDR]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[HOST_ADDR_PTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf new 
file mode 100644 index 0000000000000..e17869b2d6357 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf @@ -0,0 +1,17 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +module devptr + real, device, pointer, dimension(:) :: dev_ptr +end module + +use devptr +real, device, target, dimension(4) :: a_dev +a_dev = 42.0 +dev_ptr => a_dev +end + +! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> +! CHECK-LABEL: func.func @_QQmain() +! CHECK: fir.embox +! CHECK: fir.store +! CHECK: cuf.sync_descriptor @_QMdevptrEdev_ptr