Skip to content

Commit 91e74bd

Browse files
committed
Fix address space cast for fir.address_of inside a gpu.module
1 parent 2fc7512 commit 91e74bd

File tree

2 files changed

+33
-12
lines changed

2 files changed

+33
-12
lines changed

flang/lib/Optimizer/CodeGen/CodeGen.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,18 @@ struct AddrOfOpConversion : public fir::FIROpConversion<fir::AddrOfOp> {
176176
llvm::LogicalResult
177177
matchAndRewrite(fir::AddrOfOp addr, OpAdaptor adaptor,
178178
mlir::ConversionPatternRewriter &rewriter) const override {
179+
180+
if (auto gpuMod = addr->getParentOfType<mlir::gpu::GPUModuleOp>()) {
181+
auto global = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
182+
if (global) {
183+
replaceWithAddrOfOrASCast(
184+
rewriter, addr->getLoc(), global.getAddrSpace(),
185+
getProgramAddressSpace(rewriter), global.getSymName(),
186+
convertType(addr.getType()), addr);
187+
}
188+
return mlir::success();
189+
}
190+
179191
auto global = addr->getParentOfType<mlir::ModuleOp>()
180192
.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
181193
replaceWithAddrOfOrASCast(

flang/test/Fir/CUDA/cuda-code-gen.mlir

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -287,19 +287,28 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
287287

288288
// -----
289289

290-
fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
291-
%0 = fir.zero_bits i32
292-
fir.has_value %0 : i32
293-
}
294-
func.func @_QMkernelsPassign(%arg0: !fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
295-
%1 = fir.address_of(@_QMkernelsEinitial_val) : !fir.ref<i32>
296-
%14 = fir.load %1 : !fir.ref<i32>
297-
fir.store %14 to %arg0 : !fir.ref<i32>
298-
return
290+
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
291+
gpu.module @cuda_device_mod [#nvvm.target<chip = "sm_90", features = "+ptx75", link = ["/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_builtin_intrinsics_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_cpp_builtins.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12//libdevice_nvhpc_cuda_runtime_builtins_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/cuda/12.9/nvvm/libdevice/libdevice.10.bc"]>] attributes {llvm.data_layout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"} {
292+
fir.global @_QMkernelsEinitial_val {data_attr = #cuf.cuda<constant>} : i32 {
293+
%0 = fir.zero_bits i32
294+
fir.has_value %0 : i32
295+
}
296+
gpu.func @_QMkernelsPassign(%arg0: !fir.ref<!fir.array<?xi32>>) kernel {
297+
%c-1 = arith.constant -1 : index
298+
%c1_i32 = arith.constant 1 : i32
299+
%0 = arith.constant 1 : i32
300+
%1 = arith.addi %0, %c1_i32 : i32
301+
%2 = fir.address_of(@_QMkernelsEinitial_val) : !fir.ref<i32>
302+
%4 = fir.load %2 : !fir.ref<i32>
303+
%5 = fir.convert %1 : (i32) -> i64
304+
%6 = fircg.ext_array_coor %arg0(%c-1)<%5> : (!fir.ref<!fir.array<?xi32>>, index, i64) -> !fir.ref<i32>
305+
fir.store %4 to %6 : !fir.ref<i32>
306+
gpu.return
307+
}
308+
}
299309
}
300310

301-
// CHECK: llvm.mlir.global external @_QMkernelsEinitial_val() {addr_space = 4 : i32} : i32
302-
// CHECK-LABEL: llvm.func @_QMkernelsPassign
311+
// CHECK: llvm.mlir.global external @_QMkernelsEinitial_val() {addr_space = 4 : i32} : i32
312+
// CHECK-LABEL: gpu.func @_QMkernelsPassign
303313
// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
304314
// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
305-

0 commit comments

Comments
 (0)