Commit 48657bf

[flang][cuda] Handle launch of cooperative kernel (#124362)
Add the `CUFLaunchCooperativeKernel` runtime entry point and lower `gpu.launch_func` ops that carry the `grid_global` attribute to it.
1 parent 6409799
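For background, a cooperative kernel is one whose threads may synchronize across the entire grid (for example via cooperative groups), and CUDA requires such kernels to be started through cudaLaunchCooperativeKernel rather than a regular launch. Below is a minimal plain-CUDA sketch of that pattern; the kernel and variable names are illustrative only, not taken from this commit.

// Minimal cooperative-launch sketch (illustrative; names are hypothetical).
// Grid-wide sync typically requires building with -rdc=true.
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void step(float *data) {
  cg::grid_group grid = cg::this_grid();
  data[grid.thread_rank()] += 1.0f;
  grid.sync(); // grid-wide barrier: only legal in a cooperative launch
}

int main() {
  float *d = nullptr;
  cudaMalloc(&d, 1024 * sizeof(float));
  void *args[] = {&d};
  // Unlike a <<<...>>> launch, cooperative kernels go through this entry.
  cudaLaunchCooperativeKernel((void *)step, dim3(4), dim3(256), args,
                              /*sharedMem=*/0, /*stream=*/0);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}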

File tree: 4 files changed, +116 −6 lines


flang/include/flang/Runtime/CUDA/kernel.h

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
     int32_t smem, void **params, void **extra);
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra);
+
 } // extern "C"
 
 #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
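The RTDEF symbol is exported under the usual _FortranA prefix, i.e. _FortranACUFLaunchCooperativeKernel (see the CHECK line in the test below). A hypothetical host-side call with the same shape as the lowered code, where kernelPtr and devArg are placeholder names rather than anything from this commit:

// Hypothetical direct use of the new entry point, for illustration only.
#include <cstdint>

extern "C" void _FortranACUFLaunchCooperativeKernel(
    const void *, intptr_t, intptr_t, intptr_t, intptr_t, intptr_t,
    intptr_t, int32_t, void **, void **);

void launchExample(const void *kernelPtr, void *devArg) {
  void *params[] = {&devArg};
  _FortranACUFLaunchCooperativeKernel(kernelPtr,
      /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1,
      /*blockX=*/10, /*blockY=*/1, /*blockZ=*/1,
      /*smem=*/0, params, /*extra=*/nullptr);
}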

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 12 additions & 6 deletions
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
           adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
           dynamicMemorySize, kernelArgs, nullPtr});
     } else {
-      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
-          RTNAME_STRING(CUFLaunchKernel));
+      auto procAttr =
+          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+      bool isGridGlobal =
+          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+      llvm::StringRef fctName = isGridGlobal
+                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+                                    : RTNAME_STRING(CUFLaunchKernel);
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
            llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
-      auto cufLaunchKernel = mlir::SymbolRefAttr::get(
-          mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+      auto cufLaunchKernel =
+          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
       if (!funcOp) {
         mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
         rewriter.setInsertionPointToStart(mod.getBody());
-        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
-            loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+        auto launchKernelFuncOp =
+            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
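Net effect: the callee symbol is now derived from the op's cuf.proc_attr (grid_global selects _FortranACUFLaunchCooperativeKernel; everything else keeps _FortranACUFLaunchKernel). The LLVM function type is unchanged, and the declaration is still created lazily with private visibility when the module does not already contain it.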

flang/runtime/CUDA/kernel.cpp

Lines changed: 65 additions & 0 deletions
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
   CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
 }
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra) {
+  dim3 gridDim;
+  gridDim.x = gridX;
+  gridDim.y = gridY;
+  gridDim.z = gridZ;
+  dim3 blockDim;
+  blockDim.x = blockX;
+  blockDim.y = blockY;
+  blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (gridX > 0) {
+        maxBlocks = maxBlocks / gridDim.x;
+      }
+      if (gridY > 0) {
+        maxBlocks = maxBlocks / gridDim.y;
+      }
+      if (gridZ > 0) {
+        maxBlocks = maxBlocks / gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
+  cudaStream_t stream = 0; // TODO: stream management
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+      kernel, gridDim, blockDim, params, smem, stream));
+}
+
 } // extern "C"
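The negative-extent branch above derives a grid size from occupancy: when exactly one grid dimension is negative, it is set to multiProcCount times the kernel's maximum active blocks per multiprocessor, divided by the extents of the dimensions that were given, and clamped to at least 1. A standalone sketch of that rule follows; the helper name and its fixed1/fixed2 parameters are assumptions for illustration, not part of the runtime.

// Sketch of the occupancy-based sizing used above (helper name and
// parameters are illustrative only).
#include <cuda_runtime.h>

static unsigned occupancyGridExtent(const void *kernel, dim3 blockDim,
                                    int smem, unsigned fixed1,
                                    unsigned fixed2) {
  int dev = 0, smCount = 0, blocksPerSM = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, dev);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocksPerSM, kernel, blockDim.x * blockDim.y * blockDim.z, smem);
  // Total co-resident blocks on the device, spread across the two
  // dimensions whose extents the caller fixed; never below 1.
  int blocks = (smCount * blocksPerSM) / (int)(fixed1 * fixed2);
  return blocks < 1 ? 1u : (unsigned)blocks;
}

For example, on a device with 108 multiprocessors where the kernel sustains 2 blocks per multiprocessor, a launch with grid extents (-1, 4, 1) would resolve to gridDim.x = (108 * 2) / 4 = 54.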

flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir

Lines changed: 35 additions & 0 deletions
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
