diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Transforms/CUFCommon.h
index f019d1893bda4..df1b709dc8608 100644
--- a/flang/include/flang/Optimizer/Transforms/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Transforms/CUFCommon.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
 #define FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
 
+#include "flang/Optimizer/Dialect/FIROps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/BuiltinOps.h"
 
@@ -21,6 +22,7 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
                                             mlir::SymbolTable &symTab);
 
 bool isInCUDADeviceContext(mlir::Operation *op);
+bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 
 } // namespace cuf
 
diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
index 73a46843f0320..9591f48c5d417 100644
--- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
@@ -106,7 +106,8 @@ struct CUFAddConstructor
 
       mlir::func::FuncOp func;
       switch (attr.getValue()) {
-      case cuf::DataAttribute::Device: {
+      case cuf::DataAttribute::Device:
+      case cuf::DataAttribute::Constant: {
         func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
             loc, builder);
         auto fTy = func.getFunctionType();
diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
index 5b7631bbacb5f..bbe33217e8f45 100644
--- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
@@ -43,3 +43,14 @@ bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
   }
   return false;
 }
+
+bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
+  if (op.getConstant())
+    return false;
+  auto attr = op.getDataAttr();
+  if (attr && (*attr == cuf::DataAttribute::Device ||
+               *attr == cuf::DataAttribute::Managed ||
+               *attr == cuf::DataAttribute::Constant))
+    return true;
+  return false;
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index 714b0b291be1e..18150c4e595d4 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -18,6 +18,7 @@
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/DenseSet.h"
 
 namespace fir {
 #define GEN_PASS_DEF_CUFDEVICEGLOBAL
@@ -27,36 +28,30 @@ namespace fir {
 namespace {
 
 static void processAddrOfOp(fir::AddrOfOp addrOfOp,
-                            mlir::SymbolTable &symbolTable, bool onlyConstant) {
+                            mlir::SymbolTable &symbolTable,
+                            llvm::DenseSet<fir::GlobalOp> &candidates) {
   if (auto globalOp = symbolTable.lookup<fir::GlobalOp>(
           addrOfOp.getSymbol().getRootReference().getValue())) {
-    bool isCandidate{(onlyConstant ? globalOp.getConstant() : true) &&
-                     !globalOp.getDataAttr()};
-    if (isCandidate)
-      globalOp.setDataAttrAttr(cuf::DataAttributeAttr::get(
-          addrOfOp.getContext(), globalOp.getConstant()
-                                     ? cuf::DataAttribute::Constant
-                                     : cuf::DataAttribute::Device));
+    // TODO: limit candidates to non-scalars. Scalars appear to have been
+    // folded in already.
+    if (globalOp.getConstant()) {
+      candidates.insert(globalOp);
+    }
   }
 }
 
-static void prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
-                                         mlir::SymbolTable &symbolTable,
-                                         bool onlyConstant = true) {
+static void
+prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
+                             mlir::SymbolTable &symbolTable,
+                             llvm::DenseSet<fir::GlobalOp> &candidates) {
+
   auto cudaProcAttr{
       funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName())};
-  if (!cudaProcAttr || cudaProcAttr.getValue() == cuf::ProcAttribute::Host) {
-    // Look for globlas in CUF KERNEL DO operations.
-    for (auto cufKernelOp : funcOp.getBody().getOps<cuf::KernelOp>()) {
-      cufKernelOp.walk([&](fir::AddrOfOp addrOfOp) {
-        processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
-      });
-    }
-    return;
+  if (cudaProcAttr && cudaProcAttr.getValue() != cuf::ProcAttribute::Host) {
+    funcOp.walk([&](fir::AddrOfOp addrOfOp) {
+      processAddrOfOp(addrOfOp, symbolTable, candidates);
+    });
   }
-  funcOp.walk([&](fir::AddrOfOp addrOfOp) {
-    processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
-  });
 }
 
 class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
@@ -67,9 +62,10 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
     if (!mod)
       return signalPassFailure();
 
+    llvm::DenseSet<fir::GlobalOp> candidates;
     mlir::SymbolTable symTable(mod);
     mod.walk([&](mlir::func::FuncOp funcOp) {
-      prepareImplicitDeviceGlobals(funcOp, symTable);
+      prepareImplicitDeviceGlobals(funcOp, symTable, candidates);
       return mlir::WalkResult::advance();
     });
 
@@ -80,22 +76,15 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
       return signalPassFailure();
     mlir::SymbolTable gpuSymTable(gpuMod);
     for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
-      auto attr = globalOp.getDataAttrAttr();
-      if (!attr)
-        continue;
-      switch (attr.getValue()) {
-      case cuf::DataAttribute::Device:
-      case cuf::DataAttribute::Constant:
-      case cuf::DataAttribute::Managed: {
-        auto globalName{globalOp.getSymbol().getValue()};
-        if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
-          break;
-        }
-        gpuSymTable.insert(globalOp->clone());
-      } break;
-      default:
+      if (cuf::isRegisteredDeviceGlobal(globalOp))
+        candidates.insert(globalOp);
+    }
+    for (auto globalOp : candidates) {
+      auto globalName{globalOp.getSymbol().getValue()};
+      if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
         break;
       }
+      gpuSymTable.insert(globalOp->clone());
     }
   }
 };
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 7f6843d66d39f..1df82e6accfed 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -81,15 +81,6 @@ static bool hasDoubleDescriptors(OpTy op) {
   return false;
 }
 
-bool isDeviceGlobal(fir::GlobalOp op) {
-  auto attr = op.getDataAttr();
-  if (attr && (*attr == cuf::DataAttribute::Device ||
-               *attr == cuf::DataAttribute::Managed ||
-               *attr == cuf::DataAttribute::Constant))
-    return true;
-  return false;
-}
-
 static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
                                    mlir::Location loc, mlir::Type toTy,
                                    mlir::Value val) {
@@ -388,7 +379,7 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
     if (auto addrOfOp = op.getMemref().getDefiningOp<fir::AddrOfOp>()) {
       if (auto global = symTab.lookup<fir::GlobalOp>(
               addrOfOp.getSymbol().getRootReference().getValue())) {
-        if (isDeviceGlobal(global)) {
+        if (cuf::isRegisteredDeviceGlobal(global)) {
           rewriter.setInsertionPointAfter(addrOfOp);
           auto mod = op->getParentOfType<mlir::ModuleOp>();
           fir::FirOpBuilder builder(rewriter, mod);
@@ -833,7 +824,7 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
           addrOfOp.getSymbol().getRootReference().getValue())) {
         if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(global.getType())))
           return true;
-        if (isDeviceGlobal(global))
+        if (cuf::isRegisteredDeviceGlobal(global))
           return false;
       }
     }
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 29efdb083878a..eb118ccee311c 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -39,7 +39,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
 
-  fir.global @_QMiso_c_bindingECc_int {data_attr = #cuf.cuda<constant>} constant : i32
+  fir.global @_QMiso_c_bindingECc_int constant : i32
 
   fir.type_info @_QM__fortran_builtinsT__builtin_c_ptr noinit nodestroy nofinal : !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 
diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir
index 2baead4010f5c..94ee74736f650 100644
--- a/flang/test/Fir/CUDA/cuda-global-addr.mlir
+++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --cuf-convert %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
 fir.global @_QMmod1Eadev {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {
@@ -34,3 +34,33 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[ARRAY_COOR:.*]] = fir.array_coor %[[DECL]](%{{.*}}) %c4{{.*}} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 // CHECK: %[[ARRAY_COOR_PTR:.*]] = fir.convert %[[ARRAY_COOR]] : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
 // CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[ARRAY_COOR_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
+fir.global @_QMdevmodEdarray {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+  %c0 = arith.constant 0 : index
+  %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+  %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+  fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+}
+func.func @_QQmain() attributes {fir.bindc_name = "arraysize"} {
+  %0 = fir.address_of(@_QMiso_c_bindingECc_int) : !fir.ref<i32>
+  %1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_c_bindingECc_int"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %2 = fir.address_of(@_QMdevmodEdarray) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %3 = fir.declare %2 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdevmodEdarray"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %4 = fir.alloca i32 {bindc_name = "exp", uniq_name = "_QFEexp"}
+  %5 = fir.declare %4 {uniq_name = "_QFEexp"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %6 = fir.alloca i32 {bindc_name = "hsize", uniq_name = "_QFEhsize"}
+  %7 = fir.declare %6 {uniq_name = "_QFEhsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  return
+}
+fir.global @_QMiso_c_bindingECc_int constant : i32
+}
+
+// We cannot call _FortranACUFGetDeviceAddress on a constant global.
+// There is no symbol for it and the call would result in an unresolved
+// reference.
+// CHECK-NOT: fir.call {{.*}}GetDeviceAddress
+
diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
index 772e2696171a6..5a4cc8590f416 100644
--- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
@@ -23,7 +23,7 @@ // Test that global used in device function are flagged with the correct
 // CHECK: %[[GLOBAL:.*]] = fir.address_of(@_QQcl[[SYMBOL:.*]]) : !fir.ref<!fir.char<1,32>>
 // CHECK: %[[CONV:.*]] = fir.convert %[[GLOBAL]] : (!fir.ref<!fir.char<1,32>>) -> !fir.ref<i8>
 // CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
-// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,32>
+// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] constant : !fir.char<1,32>
 
 // CHECK-LABEL: gpu.module @cuda_device_mod
 // CHECK: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a
@@ -99,10 +99,11 @@ // Test that global used in device function are flagged with the correct
   fir.has_value %0 : !fir.char<1,11>
 }
 
-// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
+// Checking that a constant fir.global that is only used in host code is not copied over to the device.
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>
 
 // CHECK-LABEL: gpu.module @cuda_device_mod
-// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
+// CHECK-NOT: fir.global linkonce @_QQclX5465737420504153534544
 
 // -----
 
@@ -140,7 +141,8 @@ // Test that global used in device function are flagged with the correct
 }
 func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}
 
-// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
+// Checking that a constant fir.global that is used in device code is copied over to the device.
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>
 
 // CHECK-LABEL: gpu.module @cuda_device_mod
-// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant
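Reviewer note: the behavior change above hinges on a single predicate. A `fir.global` is treated as registered with the CUDA runtime only if it carries a device, managed, or constant data attribute *and* is not a compile-time `constant` global; constant globals are never registered, so emitting `_FortranACUFGetDeviceAddress` for them would reference a symbol that does not exist. Below is a minimal standalone sketch of that predicate for sanity-checking the cases exercised by the tests. It uses stub types only: `GlobalStub`, `DataAttribute`, and their fields are hypothetical stand-ins for `fir::GlobalOp::getConstant()` and `fir::GlobalOp::getDataAttr()`, not the real FIR API.

```cpp
// Standalone model of cuf::isRegisteredDeviceGlobal (stub types, not FIR).
#include <cassert>
#include <optional>

// Stand-in for cuf::DataAttribute (hypothetical subset).
enum class DataAttribute { Device, Managed, Constant, Pinned };

// Stand-in for fir::GlobalOp: only the two queries the predicate needs.
struct GlobalStub {
  bool isCompileTimeConstant;        // models fir::GlobalOp::getConstant()
  std::optional<DataAttribute> attr; // models fir::GlobalOp::getDataAttr()
};

// Compile-time `constant` globals are never registered with the CUDA
// runtime, so they must be filtered out before emitting registration or
// _FortranACUFGetDeviceAddress calls.
bool isRegisteredDeviceGlobal(const GlobalStub &g) {
  if (g.isCompileTimeConstant)
    return false;
  return g.attr && (*g.attr == DataAttribute::Device ||
                    *g.attr == DataAttribute::Managed ||
                    *g.attr == DataAttribute::Constant);
}

int main() {
  // `constant` global (e.g. @_QMiso_c_bindingECc_int): not registered.
  assert(!isRegisteredDeviceGlobal({true, DataAttribute::Constant}));
  // Device allocatable (e.g. @_QMdevmodEdarray): registered.
  assert(isRegisteredDeviceGlobal({false, DataAttribute::Device}));
  // Host-only global with no CUDA data attribute: not registered.
  assert(!isRegisteredDeviceGlobal({false, std::nullopt}));
}
```

The same predicate now drives both CUFAddConstructor/CUFOpConversion (via `cuf::isRegisteredDeviceGlobal`) and the copy-to-`gpu.module` decision in CUFDeviceGlobal, which is what keeps the host-side registration and the device-side clones in agreement.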