diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
index 7cdb2f7ffe27d..dd204126be5db 100644
--- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
@@ -11,11 +11,13 @@
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/CodeGen/Target.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Support/DataLayout.h"
 #include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/registration.h"
@@ -84,6 +86,8 @@ struct CUFAddConstructor
     auto registeredMod = builder.create<cuf::RegisterModuleOp>(
         loc, llvmPtrTy, mlir::SymbolRefAttr::get(ctx, gpuMod.getName()));
 
+    fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
     // Register kernels
     for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
       if (func.isKernel()) {
@@ -115,17 +119,25 @@ struct CUFAddConstructor
               fir::factory::createStringLiteral(builder, loc, gblNameStr));
 
           // Global variable size
-          auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash(
-              loc, globalOp.getType(), *dl, kindMap);
-          auto size =
-              builder.createIntegerConstant(loc, idxTy, sizeAndAlign.first);
+          std::optional<uint64_t> size;
+          if (auto boxTy =
+                  mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
+            mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
+            size = dl->getTypeSizeInBits(structTy) / 8;
+          }
+          if (!size) {
+            size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(),
+                                                       *dl, kindMap)
+                       .first;
+          }
+          auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
 
           // Global variable address
           mlir::Value addr = builder.create<fir::AddrOfOp>(
               loc, globalOp.resultType(), globalOp.getSymbol());
 
           llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-              builder, loc, fTy, registeredMod, addr, gblName, size)};
+              builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
           builder.create<fir::CallOp>(loc, func, args);
         } break;
         case cuf::DataAttribute::Managed:
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 378dabbb7c7e7..99386abc4fafd 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -3,6 +3,12 @@
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
 
   fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<5xi32>
+  fir.global @_QMmtestsEndev {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+    %1 = fircg.ext_embox %0(%c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, index) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+    fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+  }
 
   gpu.module @cuda_device_mod [#nvvm.target] {
   }
@@ -18,5 +24,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense
 // CHECK-DAG: %[[VAR_ADDR2:.*]] = fir.convert %[[VAR_ADDR]] : (!fir.ref<!fir.array<5xi32>>) -> !fir.ref<i8>
 // CHECK-DAG: %[[VAR_NAME2:.*]] = fir.convert %[[VAR_NAME]] : (!fir.ref<!fir.char<1,12>>) -> !fir.ref<i8>
 // CHECK-DAG: %[[CST:.*]] = arith.constant 20 : index
-// CHECK-DAG %[[CST2:.*]] = fir.convert %[[CST]] : (index) -> i64
-// CHECK fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> none
+// CHECK-DAG: %[[CST2:.*]] = fir.convert %[[CST]] : (index) -> i64
+// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> none
+// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
+// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
+//
diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir
index 6b0cbfd3aca63..25ab8dd786a4e 100644
--- a/flang/test/Fir/CUDA/cuda-register-func.fir
+++ b/flang/test/Fir/CUDA/cuda-register-func.fir
@@ -1,6 +1,6 @@
 // RUN: fir-opt --cuf-add-constructor %s | FileCheck %s
 
-module attributes {gpu.container_module} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
   gpu.module @cuda_device_mod {
     gpu.func @_QPsub_device1() kernel {
       gpu.return