Skip to content

Commit 1d05d69

Browse files
authored
[flang][cuda] Fix offset with multiple assumed size shared array (#154844)
When multiple assumed-size variables are used in a kernel with dynamic shared memory, each variable uses offset 0. Update the pass to account for that.

```
attributes(global) subroutine testany( a )
  real(4), shared :: smasks(*)
  real(8), shared :: dmasks(*)
end subroutine
```
1 parent fa67855 commit 1d05d69

File tree

2 files changed

+49
-4
lines changed

2 files changed

+49
-4
lines changed

flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ using namespace Fortran::runtime::cuda;
3838

3939
namespace {
4040

41+
/// Return true when `shape` holds exactly one extent and that extent is a
/// known constant equal to -1. In the accompanying test, -1 is the extent
/// passed to cuf.shared_memory for assumed-size arrays such as `smasks(*)`.
static bool isAssumedSize(mlir::ValueRange shape) {
42+
if (shape.size() != 1)
43+
return false;
44+
// `val` may be empty; presumably that means the extent is not a
// compile-time constant (verify against fir::getIntIfConstant).
std::optional<std::int64_t> val = fir::getIntIfConstant(shape[0]);
45+
if (val && *val == -1)
46+
return true;
47+
return false;
48+
}
49+
4150
struct CUFComputeSharedMemoryOffsetsAndSize
4251
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
4352
CUFComputeSharedMemoryOffsetsAndSize> {
@@ -82,12 +91,12 @@ struct CUFComputeSharedMemoryOffsetsAndSize
8291
alignment = std::max(alignment, align);
8392
uint64_t tySize = dl->getTypeSize(ty);
8493
++nbDynamicSharedVariables;
85-
if (crtDynOffset) {
86-
sharedOp.getOffsetMutable().assign(
87-
builder.createConvert(loc, i32Ty, crtDynOffset));
88-
} else {
94+
if (isAssumedSize(sharedOp.getShape()) || !crtDynOffset) {
8995
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
9096
sharedOp.getOffsetMutable().assign(zero);
97+
} else {
98+
sharedOp.getOffsetMutable().assign(
99+
builder.createConvert(loc, i32Ty, crtDynOffset));
91100
}
92101

93102
mlir::Value dynSize =

flang/test/Fir/CUDA/cuda-shared-offset.mlir

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,4 +121,40 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
121121
// CHECK-LABEL: gpu.func @_QPnoshared()
122122
// CHECK-NOT: fir.global internal @_QPnoshared__shared_mem
123123

124+
// -----
125+
126+
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
127+
gpu.module @cuda_device_mod {
128+
gpu.func @_QMmtestsPtestany(%arg0: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
129+
%0 = fir.dummy_scope : !fir.dscope
130+
%c-1 = arith.constant -1 : index
131+
%1 = fir.shape %c-1 : (index) -> !fir.shape<1>
132+
%2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QMmtestsFtestanyEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
133+
%3 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
134+
%4:2 = hlfir.declare %3 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
135+
%5 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
136+
%6:2 = hlfir.declare %5 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
137+
%c-1_0 = arith.constant -1 : index
138+
%7 = cuf.shared_memory !fir.array<?xf64>, %c-1_0 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
139+
%8 = fir.shape %c-1_0 : (index) -> !fir.shape<1>
140+
%9:2 = hlfir.declare %7(%8) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEdmasks"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
141+
%10 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
142+
%11:2 = hlfir.declare %10 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
143+
%12 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmtestsFtestanyEi"}
144+
%13:2 = hlfir.declare %12 {uniq_name = "_QMmtestsFtestanyEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
145+
%14 = fir.alloca i32 {bindc_name = "iam", uniq_name = "_QMmtestsFtestanyEiam"}
146+
%15:2 = hlfir.declare %14 {uniq_name = "_QMmtestsFtestanyEiam"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
147+
%16 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmtestsFtestanyEj"}
148+
%17:2 = hlfir.declare %16 {uniq_name = "_QMmtestsFtestanyEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
149+
%c-1_1 = arith.constant -1 : index
150+
%18 = cuf.shared_memory !fir.array<?xf32>, %c-1_1 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
151+
%19 = fir.shape %c-1_1 : (index) -> !fir.shape<1>
152+
%20:2 = hlfir.declare %18(%19) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEsmasks"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
153+
gpu.return
154+
}
155+
}
156+
}
124157

158+
// CHECK-LABEL: gpu.func @_QMmtestsPtestany
159+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
160+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>

0 commit comments

Comments
 (0)