diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 18150c4e595d4..72b894d1cec75 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -68,6 +68,11 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { prepareImplicitDeviceGlobals(funcOp, symTable, candidates); return mlir::WalkResult::advance(); }); + mod.walk([&](cuf::KernelOp kernelOp) { + kernelOp.walk([&](fir::AddrOfOp addrOfOp) { + processAddrOfOp(addrOfOp, symTable, candidates); + }); + }); // Copying the device global variable into the gpu module mlir::SymbolTable parentSymTable(mod); diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 5a4cc8590f416..ec5ed06824e22 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -146,3 +146,48 @@ // Test that global used in device function are flagged with the correct // CHECK-LABEL: gpu.module @cuda_device_mod // CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant + +// ----- + +func.func @_QQmain() attributes {fir.bindc_name = "cufkernel_global"} { + %c10 = arith.constant 10 : index + %c5_i32 = arith.constant 5 : i32 + %c6_i32 = arith.constant 6 : i32 + %c1 = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c10_i32 = arith.constant 10 : i32 + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + cuf.kernel<<<%c10_i32, %c1_i32>>> (%arg0 : index) = (%c1 : index) to (%c10 : index) step (%c1 : index) { + %2 = fir.convert %arg0 : (index) -> i32 + fir.store %2 to %1#1 : !fir.ref + %3 = fir.load %1#0 : !fir.ref + %4 = arith.cmpi eq, %3, %c1_i32 : i32 + cf.cond_br %4, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %5 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref> + %6 = fir.convert %5 : (!fir.ref>) -> !fir.ref + %7 = fir.call @_FortranAioBeginExternalListOutput(%c6_i32, %6, %c5_i32) fastmath : (i32, !fir.ref, i32) -> !fir.ref + %8 = fir.load %1#0 : !fir.ref + %9 = fir.call @_FortranAioOutputInteger32(%7, %8) fastmath : (!fir.ref, i32) -> i1 + %10 = fir.call @_FortranAioEndIoStatement(%7) fastmath : (!fir.ref) -> i32 + cf.br ^bb2 + ^bb2: // 2 preds: ^bb0, ^bb1 + "fir.end"() : () -> () + } + return +} +func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref, i32) -> !fir.ref attributes {fir.io, fir.runtime} +fir.global linkonce @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 constant : !fir.char<1,50> { + %0 = fir.string_lit "/local/home/vclement/llvm-project/build/dummy.cuf\00"(50) : !fir.char<1,50> + fir.has_value %0 : !fir.char<1,50> +} +func.func private @_FortranAioOutputInteger32(!fir.ref, i32) -> i1 attributes {fir.io, fir.runtime} +func.func private @_FortranAioEndIoStatement(!fir.ref) -> i32 attributes {fir.io, fir.runtime} +func.func private @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) +func.func private @_FortranAProgramEndStatement() + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: fir.global linkonce @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 constant : !fir.char<1,50> +// CHECK: gpu.module @cuda_device_mod +// CHECK: fir.global linkonce @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 constant : !fir.char<1,50>