diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Transforms/CUFCommon.h index b88133489df5e..f019d1893bda4 100644 --- a/flang/include/flang/Optimizer/Transforms/CUFCommon.h +++ b/flang/include/flang/Optimizer/Transforms/CUFCommon.h @@ -20,6 +20,8 @@ namespace cuf { mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod, mlir::SymbolTable &symTab); +bool isInCUDADeviceContext(mlir::Operation *op); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Transforms/CUFCommon.cpp index 162df8f9cab9c..5b7631bbacb5f 100644 --- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp +++ b/flang/lib/Optimizer/Transforms/CUFCommon.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Transforms/CUFCommon.h" +#include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" /// Retrieve or create the CUDA Fortran GPU module in the give in \p mod. 
@@ -26,3 +28,18 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod, symTab.insert(gpuMod, insertPt); return gpuMod; } + +bool cuf::isInCUDADeviceContext(mlir::Operation *op) { + if (!op) + return false; + if (op->getParentOfType<cuf::KernelOp>() || + op->getParentOfType<mlir::gpu::GPUFuncOp>()) + return true; + if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) { + if (auto cudaProcAttr = funcOp->getAttrOfType<cuf::ProcAttributeAttr>( + cuf::getProcAttrName())) { + return cudaProcAttr.getValue() != cuf::ProcAttribute::Host; + } + } + return false; +} diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index c61179a7460e3..d3567f453fceb 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -31,6 +31,7 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/entry-names.h" @@ -1276,6 +1277,8 @@ void SimplifyIntrinsicsPass::runOnOperation() { fir::KindMapping kindMap = fir::getKindMapping(module); module.walk([&](mlir::Operation *op) { if (auto call = mlir::dyn_cast<fir::CallOp>(op)) { + if (cuf::isInCUDADeviceContext(op)) + return; if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) { mlir::StringRef funcName = callee.getLeafReference().getValue(); // Replace call to runtime function for SUM when it has single diff --git a/flang/test/Fir/CUDA/cuda-device-context.mlir b/flang/test/Fir/CUDA/cuda-device-context.mlir new file mode 100644 index 0000000000000..89d68721ccaf2 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-device-context.mlir @@ -0,0 +1,49 @@ +// RUN: fir-opt --simplify-intrinsics %s | FileCheck %s + +func.func @_QPsum_in_device(%arg0: !fir.ref> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "a"}, %arg1: i32 
{fir.bindc_name = "n"}) attributes {cuf.proc_attr = #cuf.cuda_proc} { + %c5_i32 = arith.constant 5 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.shape %c-1 : (index) -> !fir.shape<1> + %2 = fir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda, uniq_name = "_QFsum_in_deviceEa"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> !fir.ref> + %3 = fir.embox %2(%1) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + %4 = fir.alloca i32 + fir.store %arg1 to %4 : !fir.ref + %5 = fir.declare %4 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsum_in_deviceEn"} : (!fir.ref, !fir.dscope) -> !fir.ref + %12 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsum_in_deviceEi"} + %13 = fir.declare %12 {uniq_name = "_QFsum_in_deviceEi"} : (!fir.ref) -> !fir.ref + %14 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref> + %18 = fir.load %5 : !fir.ref + %19 = fir.convert %18 : (i32) -> index + %20 = arith.cmpi sgt, %19, %c0 : index + %21 = arith.select %20, %19, %c0 : index + %22 = fir.alloca !fir.array, %21 {bindc_name = "auto", uniq_name = "_QFsum_in_deviceEauto"} + %23 = fir.shape %21 : (index) -> !fir.shape<1> + %24 = fir.declare %22(%23) {uniq_name = "_QFsum_in_deviceEauto"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + %25 = fir.embox %24(%23) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + %26 = fir.undefined index + %27 = fir.slice %c1, %19, %c1 : (index, index, index) -> !fir.slice<1> + %28 = fir.embox %24(%23) [%27] : (!fir.ref>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> + %29 = fir.absent !fir.box + %30 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref> + %31 = fir.convert %28 : (!fir.box>) -> !fir.box + %32 = fir.convert %30 : (!fir.ref>) -> !fir.ref + %33 = fir.convert %c0 : (index) -> i32 + %34 = fir.convert %29 : (!fir.box) -> !fir.box + %35 = fir.call @_FortranASumInteger4(%31, %32, %c5_i32, %33, %34) fastmath : 
(!fir.box, !fir.ref, i32, i32, !fir.box) -> i32 + %36 = fir.load %13 : !fir.ref + %37 = fir.convert %36 : (i32) -> i64 + %38 = fir.array_coor %2(%1) %37 : (!fir.ref>, !fir.shape<1>, i64) -> !fir.ref + fir.store %35 to %38 : !fir.ref + return +} + +// Check that intrinsic simplification is disabled in CUDA Fortran context. The simplified intrinsic is +// created in the module op but the device func will be migrated into a gpu module op resulting in a +// missing symbol error. +// The simplified intrinsic could also be migrated to the gpu module but the choice has not been made +// at this point. +// CHECK-LABEL: func.func @_QPsum_in_device +// CHECK-NOT: fir.call @_FortranASumInteger4x1_contract_simplified