[flang][cuda] Handle launch of cooperative kernel #124362
@llvm/pr-subscribers-flang-runtime @llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Add the CUFLaunchCooperativeKernel entry point and lower gpu.launch_func with the grid_global attribute to this entry point.

Full diff: https://github.com/llvm/llvm-project/pull/124362.diff

4 Files Affected:
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 85afda09e347ae..1f812b580327af 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
int32_t smem, void **params, void **extra);
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+ intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+ intptr_t blockZ, int32_t smem, void **params, void **extra);
+
} // extern "C"
#endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
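
The new entry point mirrors the existing CUFLaunchKernel signature, so the code generator only has to swap the callee symbol. Below is a minimal, hedged sketch of what a compiler-emitted call could look like; the entry-point name and signature come from the diff above (RTDEF expands to the `_FortranA` prefix, as the test below confirms), while `kernelFn`, `devBuffer`, `launchExample`, and the launch configuration are purely illustrative.

```cpp
#include <cstdint>

// Declaration matching the runtime entry point added in kernel.h.
extern "C" void _FortranACUFLaunchCooperativeKernel(const void *kernel,
    intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX,
    intptr_t blockY, intptr_t blockZ, int32_t smem, void **params,
    void **extra);

// Hypothetical caller: launches a cooperative kernel over one device buffer.
void launchExample(const void *kernelFn, void *devBuffer) {
  void *params[] = {&devBuffer};
  // gridX = -1 asks the runtime to size that dimension from occupancy
  // (see the kernel.cpp logic below); 10x1x1 threads per block, no dynamic
  // shared memory, and a null `extra` (currently unused).
  _FortranACUFLaunchCooperativeKernel(kernelFn, /*gridX=*/-1, /*gridY=*/1,
      /*gridZ=*/1, /*blockX=*/10, /*blockY=*/1, /*blockZ=*/1, /*smem=*/0,
      params, /*extra=*/nullptr);
}
```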
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60aa401e1cc8cc..c469b5a95b0447 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
dynamicMemorySize, kernelArgs, nullPtr});
} else {
- auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
- RTNAME_STRING(CUFLaunchKernel));
+ auto procAttr =
+ op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+ bool isGridGlobal =
+ procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+ llvm::StringRef fctName = isGridGlobal
+ ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+ : RTNAME_STRING(CUFLaunchKernel);
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
- auto cufLaunchKernel = mlir::SymbolRefAttr::get(
- mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+ auto cufLaunchKernel =
+ mlir::SymbolRefAttr::get(mod.getContext(), fctName);
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
- auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
- loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+ auto launchKernelFuncOp =
+ rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index bdc04ccb17672b..02d89fb8423a5b 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
}
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+ intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+ intptr_t blockZ, int32_t smem, void **params, void **extra) {
+ dim3 gridDim;
+ gridDim.x = gridX;
+ gridDim.y = gridY;
+ gridDim.z = gridZ;
+ dim3 blockDim;
+ blockDim.x = blockX;
+ blockDim.y = blockY;
+ blockDim.z = blockZ;
+ unsigned nbNegGridDim{0};
+ if (gridX < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridY < 0) {
+ ++nbNegGridDim;
+ }
+ if (gridZ < 0) {
+ ++nbNegGridDim;
+ }
+ if (nbNegGridDim == 1) {
+ int maxBlocks, nbBlocks, dev, multiProcCount;
+ cudaError_t err1, err2;
+ nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+ cudaGetDevice(&dev);
+ err1 = cudaDeviceGetAttribute(
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+ &maxBlocks, kernel, nbBlocks, smem);
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
+ maxBlocks = multiProcCount * maxBlocks;
+ }
+ if (maxBlocks > 0) {
+ if (gridX > 0) {
+ maxBlocks = maxBlocks / gridDim.x;
+ }
+ if (gridY > 0) {
+ maxBlocks = maxBlocks / gridDim.y;
+ }
+ if (gridZ > 0) {
+ maxBlocks = maxBlocks / gridDim.z;
+ }
+ if (maxBlocks < 1) {
+ maxBlocks = 1;
+ }
+ if (gridX < 0) {
+ gridDim.x = maxBlocks;
+ }
+ if (gridY < 0) {
+ gridDim.y = maxBlocks;
+ }
+ if (gridZ < 0) {
+ gridDim.z = maxBlocks;
+ }
+ }
+ } else if (nbNegGridDim > 1) {
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Too many invalid grid dimensions");
+ }
+ cudaStream_t stream = 0; // TODO stream management
+ CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+ kernel, gridDim, blockDim, params, smem, stream));
+}
+
} // extern "C"
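
The runtime body above implements a simple heuristic: if exactly one grid dimension is negative (a "fill the device" request), it multiplies the SM count by the kernel's occupancy to get the maximum number of co-resident blocks (the hard upper bound for a cooperative launch), divides out any explicitly given dimensions, and clamps to at least one block; more than one negative dimension is a hard error. Here is a standalone sketch of just that computation, with illustrative device numbers (the helper name and the example figures are not from the patch):

```cpp
#include <cstdio>

// Mirrors the grid-sizing arithmetic in CUFLaunchCooperativeKernel for the
// case where gridX was passed as a negative value and gridY/gridZ are given.
int cooperativeGridX(int multiProcCount, int maxBlocksPerSM, int gridY,
                     int gridZ) {
  // Upper bound on simultaneously resident blocks for this kernel.
  int maxBlocks = multiProcCount * maxBlocksPerSM;
  // Divide out the dimensions the user specified explicitly.
  if (gridY > 0)
    maxBlocks /= gridY;
  if (gridZ > 0)
    maxBlocks /= gridZ;
  // Never launch an empty grid.
  return maxBlocks < 1 ? 1 : maxBlocks;
}

int main() {
  // Example: 80 SMs, occupancy reports 8 resident blocks per SM for this
  // kernel/block size, gridY = gridZ = 1  ->  a 640 x 1 x 1 grid.
  std::printf("gridDim.x = %d\n", cooperativeGridX(80, 8, 1, 1));
  return 0;
}
```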
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 3db2336c90a7d4..0827e378c7c07e 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_QMmod1Phost_sub() {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+ %2 = llvm.mlir.constant(40 : i64) : i64
+ %3 = llvm.mlir.constant(16 : i32) : i32
+ %4 = llvm.mlir.constant(25 : i32) : i32
+ %5 = llvm.mlir.constant(21 : i32) : i32
+ %6 = llvm.mlir.constant(17 : i32) : i32
+ %7 = llvm.mlir.constant(1 : index) : i64
+ %8 = llvm.mlir.constant(27 : i32) : i32
+ %9 = llvm.mlir.constant(6 : i32) : i32
+ %10 = llvm.mlir.constant(1 : i32) : i32
+ %11 = llvm.mlir.constant(0 : i32) : i32
+ %12 = llvm.mlir.constant(10 : index) : i64
+ %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+ %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+ gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+ llvm.return
+ }
+ llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+ llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+ %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+ llvm.return %0 : !llvm.array<2 x i8>
+ }
+ llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
Review comment on the new entry point in flang/runtime/CUDA/kernel.cpp:

void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
    intptr_t blockZ, int32_t smem, void **params, void **extra) {
Is this `extra` argument a placeholder for the stream?
No. Stream is not handled yet; I'll add it later to the three launch functions once we have proper support. This `extra` argument is modeled on the MLIR runtime entry point and is currently unused, but I would like to keep it for consistency across the three functions. If in the end we don't need it, I'll remove it from all three at the same time.
Add the CUFLaunchCooperativeKernel entry point and lower gpu.launch_func with the grid_global attribute to this entry point.
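
For context on why grid_global routines need this separate entry point: a grid_global kernel may synchronize across the entire grid, and CUDA only allows grid-wide synchronization for kernels started through cudaLaunchCooperativeKernel. A minimal CUDA C++ analogue of such a kernel follows (illustrative only, not generated by this patch; the Fortran front end emits the equivalent for grid_global device routines):

```cuda
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Two-phase kernel: the grid-wide barrier is only legal when the kernel is
// started with cudaLaunchCooperativeKernel (or the entry point added here).
__global__ void twoPhase(int *a, int *b, int n) {
  cg::grid_group grid = cg::this_grid();
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    a[i] += 1;                     // phase 1
  grid.sync();                     // wait for every block in the grid
  if (i < n)
    b[i] = a[i] + a[(i + 1) % n];  // phase 2 reads other blocks' results
}
```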