diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 85afda09e347a..1f812b580327a 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
     int32_t smem, void **params, void **extra);
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra);
+
 } // extern "C"
 
 #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60aa401e1cc8c..c469b5a95b044 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
           adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicMemorySize,
           kernelArgs, nullPtr});
     } else {
-      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
-          RTNAME_STRING(CUFLaunchKernel));
+      auto procAttr =
+          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+      bool isGridGlobal =
+          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+      llvm::StringRef fctName = isGridGlobal
+                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+                                    : RTNAME_STRING(CUFLaunchKernel);
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
               llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
-      auto cufLaunchKernel = mlir::SymbolRefAttr::get(
-          mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+      auto cufLaunchKernel =
+          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
       if (!funcOp) {
         mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
         rewriter.setInsertionPointToStart(mod.getBody());
-        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
-            loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+        auto launchKernelFuncOp =
+            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index bdc04ccb17672..02d89fb8423a5 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
   CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
 }
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra) {
+  dim3 gridDim;
+  gridDim.x = gridX;
+  gridDim.y = gridY;
+  gridDim.z = gridZ;
+  dim3 blockDim;
+  blockDim.x = blockX;
+  blockDim.y = blockY;
+  blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (gridX > 0) {
+        maxBlocks = maxBlocks / gridDim.x;
+      }
+      if (gridY > 0) {
+        maxBlocks = maxBlocks / gridDim.y;
+      }
+      if (gridZ > 0) {
+        maxBlocks = maxBlocks / gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
+  cudaStream_t stream = 0; // TODO stream management
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+      kernel, gridDim, blockDim, params, smem, stream));
+}
+
 } // extern "C"
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 3db2336c90a7d..0827e378c7c07 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
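
Note (not part of the patch): the grid_global path routes through a cooperative launch because a kernel that synchronizes across the whole grid is only valid when every block can be resident on the device at once. The sketch below is a minimal plain CUDA C++ illustration of what _FortranACUFLaunchCooperativeKernel does when a grid extent reaches it as a negative value: query occupancy per multiprocessor, scale by the multiprocessor count, and launch through cudaLaunchCooperativeKernel. The kernel name, block size, and printf are illustrative assumptions, not taken from the patch.

// Build with something like: nvcc -arch=sm_70 -rdc=true coop.cu
#include <cooperative_groups.h>
#include <cuda_runtime.h>
#include <cstdio>

namespace cg = cooperative_groups;

// Illustrative kernel: grid.sync() requires a cooperative launch.
__global__ void gridGlobalKernel(int *data) {
  cg::grid_group grid = cg::this_grid();
  data[grid.thread_rank()] += 1;
  grid.sync(); // grid-wide barrier
}

int main() {
  int dev = 0, numSM = 0, blocksPerSM = 0;
  const int blockSize = 256; // stands in for blockX * blockY * blockZ
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, dev);
  // Same occupancy query the runtime uses to size a '*' (negative) grid extent.
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocksPerSM, gridGlobalKernel, blockSize, /*dynamicSMemSize=*/0);
  dim3 gridDim(numSM * blocksPerSM);
  dim3 blockDim(blockSize);

  int *data = nullptr;
  cudaMalloc(&data, gridDim.x * blockDim.x * sizeof(int));
  cudaMemset(data, 0, gridDim.x * blockDim.x * sizeof(int));
  void *args[] = {&data};
  // Cooperative launch: fails if the grid cannot be co-resident on the device.
  cudaError_t err = cudaLaunchCooperativeKernel(
      (void *)gridGlobalKernel, gridDim, blockDim, args, /*sharedMem=*/0,
      /*stream=*/0);
  std::printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}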