Skip to content

Commit 2fb2d7e

Browse files
authored
[flang][cuda] Change how to handle static shared memory variables (#170388)
Generate one global per static shared memory variable so that the alignment can be set separately for each variable. The handling of dynamic shared memory is unchanged.
1 parent d2accd3 commit 2fb2d7e

File tree

8 files changed

+88
-56
lines changed

8 files changed

+88
-56
lines changed

flang/include/flang/Optimizer/Builder/CUFCommon.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include "mlir/IR/BuiltinOps.h"
1515

1616
static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
17-
static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
17+
static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem__";
1818

1919
namespace fir {
2020
class FirOpBuilder;

flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,8 @@ def cuf_SharedMemoryOp
351351
OptionalAttr<StrAttr>:$bindc_name, Variadic<AnyIntegerType>:$typeparams,
352352
Variadic<AnyIntegerType>:$shape,
353353
// offset in bytes from the shared memory base address.
354-
Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment);
354+
Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment,
355+
UnitAttr:$isStatic);
355356

356357
let results = (outs fir_ReferenceType:$ptr);
357358

flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ void cuf::SharedMemoryOp::build(
333333
bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
334334
build(builder, result, wrapAllocaResultType(inType),
335335
mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
336-
/*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{});
336+
/*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{},
337+
/*isStatic=*/nullptr);
337338
result.addAttributes(attributes);
338339
}
339340

flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 58 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,43 @@ static bool isAssumedSize(mlir::ValueRange shape) {
4646
return false;
4747
}
4848

49+
static void createSharedMemoryGlobal(fir::FirOpBuilder &builder,
50+
mlir::Location loc, llvm::StringRef prefix,
51+
llvm::StringRef suffix,
52+
mlir::gpu::GPUModuleOp gpuMod,
53+
mlir::Type sharedMemType, unsigned size,
54+
unsigned align, bool isDynamic) {
55+
std::string sharedMemGlobalName =
56+
isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str()
57+
: (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str();
58+
59+
mlir::OpBuilder::InsertionGuard guard(builder);
60+
builder.setInsertionPointToEnd(gpuMod.getBody());
61+
62+
mlir::StringAttr linkage = isDynamic ? builder.createExternalLinkage()
63+
: builder.createInternalLinkage();
64+
llvm::SmallVector<mlir::NamedAttribute> attrs;
65+
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
66+
gpuMod.getContext());
67+
attrs.push_back(mlir::NamedAttribute(
68+
fir::GlobalOp::getDataAttrAttrName(globalOpName),
69+
cuf::DataAttributeAttr::get(gpuMod.getContext(),
70+
cuf::DataAttribute::Shared)));
71+
72+
mlir::DenseElementsAttr init = {};
73+
mlir::Type i8Ty = builder.getI8Type();
74+
if (size > 0) {
75+
auto vecTy = mlir::VectorType::get(
76+
static_cast<fir::SequenceType::Extent>(size), i8Ty);
77+
mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
78+
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
79+
}
80+
auto sharedMem =
81+
fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false,
82+
sharedMemType, init, linkage, attrs);
83+
sharedMem.setAlignment(align);
84+
}
85+
4986
struct CUFComputeSharedMemoryOffsetsAndSize
5087
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
5188
CUFComputeSharedMemoryOffsetsAndSize> {
@@ -108,18 +145,23 @@ struct CUFComputeSharedMemoryOffsetsAndSize
108145
crtDynOffset, dynSize);
109146
else
110147
crtDynOffset = dynSize;
111-
112-
continue;
148+
} else {
149+
// Static shared memory.
150+
auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
151+
loc, sharedOp.getInType(), *dl, kindMap);
152+
createSharedMemoryGlobal(
153+
builder, sharedOp.getLoc(), funcOp.getName(),
154+
*sharedOp.getBindcName(), gpuMod,
155+
fir::SequenceType::get(size, i8Ty), size,
156+
sharedOp.getAlignment() ? *sharedOp.getAlignment() : align,
157+
/*isDynamic=*/false);
158+
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
159+
sharedOp.getOffsetMutable().assign(zero);
160+
if (!sharedOp.getAlignment())
161+
sharedOp.setAlignment(align);
162+
sharedOp.setIsStatic(true);
163+
++nbStaticSharedVariables;
113164
}
114-
auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
115-
sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
116-
++nbStaticSharedVariables;
117-
mlir::Value offset = builder.createIntegerConstant(
118-
loc, i32Ty, llvm::alignTo(sharedMemSize, align));
119-
sharedOp.getOffsetMutable().assign(offset);
120-
sharedMemSize =
121-
llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
122-
alignment = std::max(alignment, align);
123165
}
124166

125167
if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
@@ -130,35 +172,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize
130172
funcOp.getLoc(),
131173
"static and dynamic shared variables in a single kernel");
132174

133-
mlir::DenseElementsAttr init = {};
134-
if (sharedMemSize > 0) {
135-
auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
136-
mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
137-
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
138-
}
175+
if (nbStaticSharedVariables > 0)
176+
continue;
139177

140-
// Create the shared memory global where each shared variable will point
141-
// to.
142178
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
143-
std::string sharedMemGlobalName =
144-
(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
145-
// Dynamic shared memory needs an external linkage while static shared
146-
// memory needs an internal linkage.
147-
mlir::StringAttr linkage = nbDynamicSharedVariables > 0
148-
? builder.createExternalLinkage()
149-
: builder.createInternalLinkage();
150-
builder.setInsertionPointToEnd(gpuMod.getBody());
151-
llvm::SmallVector<mlir::NamedAttribute> attrs;
152-
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
153-
gpuMod.getContext());
154-
attrs.push_back(mlir::NamedAttribute(
155-
fir::GlobalOp::getDataAttrAttrName(globalOpName),
156-
cuf::DataAttributeAttr::get(gpuMod.getContext(),
157-
cuf::DataAttribute::Shared)));
158-
auto sharedMem = fir::GlobalOp::create(
159-
builder, funcOp.getLoc(), sharedMemGlobalName, false, false,
160-
sharedMemType, init, linkage, attrs);
161-
sharedMem.setAlignment(alignment);
179+
createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "",
180+
gpuMod, sharedMemType, sharedMemSize, alignment,
181+
/*isDynamic=*/true);
162182
}
163183
}
164184
};

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,13 @@ struct CUFSharedMemoryOpConversion
249249
"cuf.shared_memory must have an offset for code gen");
250250

251251
auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();
252+
252253
std::string sharedGlobalName =
253-
(getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
254+
op.getIsStatic()
255+
? (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix) +
256+
*op.getBindcName())
257+
.str()
258+
: (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
254259
mlir::Value sharedGlobalAddr =
255260
createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);
256261

flang/test/Fir/CUDA/cuda-code-gen.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xi32>> {cuf.data_attr = #cuf.c
201201

202202
// -----
203203

204-
fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
204+
fir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
205205

206-
// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
206+
// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
207207

208208
// -----
209209

flang/test/Fir/CUDA/cuda-shared-offset.mlir

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
1717
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
1818
// CHECK: gpu.return
1919
// CHECK: }
20-
// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
20+
// CHECK: fir.global external @_QPdynshared__shared_mem__ {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
2121

2222
// -----
2323

@@ -43,15 +43,20 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
4343

4444
// CHECK-LABEL: gpu.module @cuda_device_mod
4545
// CHECK: gpu.func @_QPshared_static()
46-
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
47-
// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
48-
// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
49-
// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
50-
// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
51-
// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
46+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
47+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "b", isStatic, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
48+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "c", isStatic, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
49+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "d", isStatic, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
50+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i64 align 8 {bindc_name = "e", isStatic, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
51+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] f32 align 4 {bindc_name = "r", isStatic, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
5252
// CHECK: gpu.return
5353
// CHECK: }
54-
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
54+
// CHECK: fir.global internal @_QPshared_static__shared_mem__a(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
55+
// CHECK: fir.global internal @_QPshared_static__shared_mem__b(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
56+
// CHECK: fir.global internal @_QPshared_static__shared_mem__c(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
57+
// CHECK: fir.global internal @_QPshared_static__shared_mem__d(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
58+
// CHECK: fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<8xi8>
59+
// CHECK: fir.global internal @_QPshared_static__shared_mem__r(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
5560
// CHECK: }
5661
// CHECK: }
5762

@@ -159,4 +164,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
159164
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
160165
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
161166

162-
// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
167+
// CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>

flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
99
%1 = cuf.shared_memory [%c4 : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
1010
llvm.return
1111
}
12-
llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
12+
llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
1313
}
1414
}
1515

1616
// CHECK-LABEL: llvm.func @_QPshared_static()
17-
// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
17+
// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
1818
// CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast %[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr
1919
// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8
20-
// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
20+
// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
2121
// CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr
2222
// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8

0 commit comments

Comments (0)