Conversation

@clementval
Contributor

Add a CUFLaunchCooperativeKernel entry point and lower gpu.launch_func ops carrying the grid_global attribute to it.
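For context, a grid_global kernel is one whose threads may synchronize across the entire grid, which CUDA supports only through the cooperative launch API. A minimal CUDA C++ sketch of why such kernels need a dedicated launch path (kernel name and logic are illustrative, not from this patch):

```cpp
// Illustrative sketch: a kernel using a grid-wide barrier. Such a kernel
// must be launched with cudaLaunchCooperativeKernel (and compiled with
// relocatable device code, -rdc=true); under an ordinary launch,
// grid.sync() is invalid.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void twoPhase(float *data, int n) {
  cg::grid_group grid = cg::this_grid();
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= 2.0f;        // phase 1
  grid.sync();              // grid-wide barrier: needs a cooperative launch
  if (i + 1 < n)
    data[i] += data[i + 1]; // phase 2 safely reads other blocks' results
}
```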

@llvmbot added the flang:runtime, flang, and flang:fir-hlfir labels Jan 24, 2025
@llvmbot
Member

llvmbot commented Jan 24, 2025

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Add a CUFLaunchCooperativeKernel entry point and lower gpu.launch_func ops carrying the grid_global attribute to it.


Full diff: https://github.com/llvm/llvm-project/pull/124362.diff

4 Files Affected:

  • (modified) flang/include/flang/Runtime/CUDA/kernel.h (+4)
  • (modified) flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp (+12-6)
  • (modified) flang/runtime/CUDA/kernel.cpp (+65)
  • (modified) flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir (+35)
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 85afda09e347ae..1f812b580327af 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
     int32_t smem, void **params, void **extra);
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra);
+
 } // extern "C"
 
 #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
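In C terms, the compiler-generated host code ends up calling this entry point with the launch configuration flattened to scalars. A hedged sketch (the -1 convention for an occupancy-sized dimension is inferred from the runtime implementation below; the helper names and the CUDA Fortran launch form are illustrative):

```cpp
// Hypothetical illustration of the call generated for a CUDA Fortran launch
// such as `call sub<<<*, 256>>>(a)`. A grid extent of -1 asks the runtime to
// size that dimension from occupancy (see kernel.cpp below). RTDEF(...)
// expands to the _FortranACUFLaunchCooperativeKernel symbol checked in the
// MLIR test at the end of the diff.
#include <cstdint>

extern "C" void _FortranACUFLaunchCooperativeKernel(const void *kernelName,
    intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX,
    intptr_t blockY, intptr_t blockZ, int32_t smem, void **params,
    void **extra);

void launchExample(const void *kernelAddr, void *deviceArg) {
  void *params[] = {&deviceArg};
  _FortranACUFLaunchCooperativeKernel(kernelAddr,
      /*gridX=*/-1, /*gridY=*/1, /*gridZ=*/1,
      /*blockX=*/256, /*blockY=*/1, /*blockZ=*/1,
      /*smem=*/0, params, /*extra=*/nullptr);
}
```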
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60aa401e1cc8cc..c469b5a95b0447 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
                            adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
                            dynamicMemorySize, kernelArgs, nullPtr});
     } else {
-      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
-          RTNAME_STRING(CUFLaunchKernel));
+      auto procAttr =
+          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+      bool isGridGlobal =
+          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+      llvm::StringRef fctName = isGridGlobal
+                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+                                    : RTNAME_STRING(CUFLaunchKernel);
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
            llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
-      auto cufLaunchKernel = mlir::SymbolRefAttr::get(
-          mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+      auto cufLaunchKernel =
+          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
       if (!funcOp) {
         mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
         rewriter.setInsertionPointToStart(mod.getBody());
-        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
-            loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+        auto launchKernelFuncOp =
+            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index bdc04ccb17672b..02d89fb8423a5b 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
   CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
 }
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra) {
+  dim3 gridDim;
+  gridDim.x = gridX;
+  gridDim.y = gridY;
+  gridDim.z = gridZ;
+  dim3 blockDim;
+  blockDim.x = blockX;
+  blockDim.y = blockY;
+  blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (gridX > 0) {
+        maxBlocks = maxBlocks / gridDim.x;
+      }
+      if (gridY > 0) {
+        maxBlocks = maxBlocks / gridDim.y;
+      }
+      if (gridZ > 0) {
+        maxBlocks = maxBlocks / gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
+  cudaStream_t stream = 0; // TODO stream management
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+      kernel, gridDim, blockDim, params, smem, stream));
+}
+
 } // extern "C"
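To make the sizing logic above concrete, here is a standalone sketch with assumed device numbers (80 SMs and 2 resident blocks per SM reported by the occupancy API; neither value comes from this patch). The single negative dimension receives whatever block budget remains after dividing out the user-specified dimensions:

```cpp
#include <cstdio>

// Standalone worked example of the occupancy-based grid sizing above.
int main() {
  int multiProcCount = 80; // assumed SM count
  int blocksPerSM = 2;     // assumed occupancy for this kernel/block/smem
  long gridX = -1, gridY = 4, gridZ = 1;

  int maxBlocks = multiProcCount * blocksPerSM; // 160 co-resident blocks
  if (gridY > 0)
    maxBlocks /= gridY;                         // 160 / 4 = 40
  if (gridZ > 0)
    maxBlocks /= gridZ;                         // 40 / 1 = 40
  if (maxBlocks < 1)
    maxBlocks = 1;
  if (gridX < 0)
    gridX = maxBlocks;                          // the '*' dimension gets 40

  // The whole grid (40 * 4 * 1 = 160 blocks) can be resident at once,
  // which cudaLaunchCooperativeKernel requires.
  std::printf("grid = (%ld, %ld, %ld)\n", gridX, gridY, gridZ);
  return 0;
}
```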
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 3db2336c90a7d4..0827e378c7c07e 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    gpu.launch_func  @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5()  {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel

@llvmbot
Member

llvmbot commented Jan 24, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

(Identical to the @llvm/pr-subscribers-flang-runtime comment above, including the full diff.)

void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem, void **params, void **extra) {
Contributor
Is this extra a placeholder for the stream?

Contributor Author
No, stream handling isn't implemented yet; I'll add it to all three launch functions once we have proper support. The extra arg is modeled on the MLIR runtime entry point and is currently unused, but I'd like to keep it for consistency across the three functions. If we end up not needing it, I'll remove it from all three at the same time.
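For reference, the trailing extra parameter mirrors the CUDA driver API, where cuLaunchKernel accepts either an array of argument pointers or a packed argument buffer (normally one of the two is null):

```cpp
// Signature from the CUDA driver API (<cuda.h>) that this runtime ABI is
// modeled after; `extra` is an alternative way to pass a packed argument
// buffer instead of kernelParams.
CUresult cuLaunchKernel(CUfunction f,
    unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
    unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
    unsigned int sharedMemBytes, CUstream hStream,
    void **kernelParams, void **extra);
```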

clementval merged commit 48657bf into llvm:main on Jan 24, 2025 (12 checks passed).
clementval deleted the cuf_kernel_coop branch on January 24, 2025 at 23:52.