Conversation

@clementval
Contributor

Add a CUFLaunchCooperativeKernel entry point and lower gpu.launch_func ops carrying the grid_global attribute to it.
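For context, a grid_global kernel is one whose threads may synchronize across the entire grid, which CUDA supports only through the cooperative launch API. A minimal CUDA C++ sketch of why such kernels need a dedicated launch path (kernel name and logic are illustrative, not from this patch):

```cpp
// Illustrative sketch: a kernel using a grid-wide barrier. Such a kernel
// must be launched with cudaLaunchCooperativeKernel (and compiled with
// relocatable device code, -rdc=true); under an ordinary launch,
// grid.sync() is invalid.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void twoPhase(float *data, int n) {
  cg::grid_group grid = cg::this_grid();
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= 2.0f;        // phase 1
  grid.sync();              // grid-wide barrier: needs a cooperative launch
  if (i + 1 < n)
    data[i] += data[i + 1]; // phase 2 safely reads other blocks' results
}
```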

@llvmbot added the flang:runtime, flang, and flang:fir-hlfir labels Jan 24, 2025
@llvmbot
Member

llvmbot commented Jan 24, 2025

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Add a CUFLaunchCooperativeKernel entry point and lower gpu.launch_func ops carrying the grid_global attribute to it.


Full diff: https://github.com/llvm/llvm-project/pull/124362.diff

4 Files Affected:

  • (modified) flang/include/flang/Runtime/CUDA/kernel.h (+4)
  • (modified) flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp (+12-6)
  • (modified) flang/runtime/CUDA/kernel.cpp (+65)
  • (modified) flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir (+35)
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index 85afda09e347ae..1f812b580327af 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
     int32_t smem, void **params, void **extra);
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra);
+
 } // extern "C"
 
 #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
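In C terms, the compiler-generated host code ends up calling this entry point with the launch configuration flattened to scalars. A hedged sketch (the -1 convention for an occupancy-sized dimension is inferred from the runtime implementation below; the helper names and the CUDA Fortran launch form are illustrative):

```cpp
// Hypothetical illustration of the call generated for a CUDA Fortran launch
// such as `call sub<<<*, 256>>>(a)`. A grid extent of -1 asks the runtime to
// size that dimension from occupancy (see kernel.cpp below). RTDEF(...)
// expands to the _FortranACUFLaunchCooperativeKernel symbol checked in the
// MLIR test at the end of the diff.
#include <cstdint>

extern "C" void _FortranACUFLaunchCooperativeKernel(const void *kernelName,
    intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX,
    intptr_t blockY, intptr_t blockZ, int32_t smem, void **params,
    void **extra);

void launchExample(const void *kernelAddr, void *deviceArg) {
  void *params[] = {&deviceArg};
  _FortranACUFLaunchCooperativeKernel(kernelAddr,
      /*gridX=*/-1, /*gridY=*/1, /*gridZ=*/1,
      /*blockX=*/256, /*blockY=*/1, /*blockZ=*/1,
      /*smem=*/0, params, /*extra=*/nullptr);
}
```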
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60aa401e1cc8cc..c469b5a95b0447 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -139,20 +139,26 @@ struct GPULaunchKernelConversion
                            adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
                            dynamicMemorySize, kernelArgs, nullPtr});
     } else {
-      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
-          RTNAME_STRING(CUFLaunchKernel));
+      auto procAttr =
+          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+      bool isGridGlobal =
+          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
+      llvm::StringRef fctName = isGridGlobal
+                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
+                                    : RTNAME_STRING(CUFLaunchKernel);
+      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
            llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
-      auto cufLaunchKernel = mlir::SymbolRefAttr::get(
-          mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+      auto cufLaunchKernel =
+          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
       if (!funcOp) {
         mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
         rewriter.setInsertionPointToStart(mod.getBody());
-        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
-            loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+        auto launchKernelFuncOp =
+            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp
index bdc04ccb17672b..02d89fb8423a5b 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang/runtime/CUDA/kernel.cpp
@@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
   CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
 }
 
+void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
+    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+    intptr_t blockZ, int32_t smem, void **params, void **extra) {
+  dim3 gridDim;
+  gridDim.x = gridX;
+  gridDim.y = gridY;
+  gridDim.z = gridZ;
+  dim3 blockDim;
+  blockDim.x = blockX;
+  blockDim.y = blockY;
+  blockDim.z = blockZ;
+  unsigned nbNegGridDim{0};
+  if (gridX < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridY < 0) {
+    ++nbNegGridDim;
+  }
+  if (gridZ < 0) {
+    ++nbNegGridDim;
+  }
+  if (nbNegGridDim == 1) {
+    int maxBlocks, nbBlocks, dev, multiProcCount;
+    cudaError_t err1, err2;
+    nbBlocks = blockDim.x * blockDim.y * blockDim.z;
+    cudaGetDevice(&dev);
+    err1 = cudaDeviceGetAttribute(
+        &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
+    err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &maxBlocks, kernel, nbBlocks, smem);
+    if (err1 == cudaSuccess && err2 == cudaSuccess) {
+      maxBlocks = multiProcCount * maxBlocks;
+    }
+    if (maxBlocks > 0) {
+      if (gridX > 0) {
+        maxBlocks = maxBlocks / gridDim.x;
+      }
+      if (gridY > 0) {
+        maxBlocks = maxBlocks / gridDim.y;
+      }
+      if (gridZ > 0) {
+        maxBlocks = maxBlocks / gridDim.z;
+      }
+      if (maxBlocks < 1) {
+        maxBlocks = 1;
+      }
+      if (gridX < 0) {
+        gridDim.x = maxBlocks;
+      }
+      if (gridY < 0) {
+        gridDim.y = maxBlocks;
+      }
+      if (gridZ < 0) {
+        gridDim.z = maxBlocks;
+      }
+    }
+  } else if (nbNegGridDim > 1) {
+    Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Too many invalid grid dimensions");
+  }
+  cudaStream_t stream = 0; // TODO stream management
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
+      kernel, gridDim, blockDim, params, smem, stream));
+}
+
 } // extern "C"
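To make the sizing logic above concrete, here is a standalone sketch with assumed device numbers (80 SMs and 2 resident blocks per SM reported by the occupancy API; neither value comes from this patch). The single negative dimension receives whatever block budget remains after dividing out the user-specified dimensions:

```cpp
#include <cstdio>

// Standalone worked example of the occupancy-based grid sizing above.
int main() {
  int multiProcCount = 80; // assumed SM count
  int blocksPerSM = 2;     // assumed occupancy for this kernel/block/smem
  long gridX = -1, gridY = 4, gridZ = 1;

  int maxBlocks = multiProcCount * blocksPerSM; // 160 co-resident blocks
  if (gridY > 0)
    maxBlocks /= gridY;                         // 160 / 4 = 40
  if (gridZ > 0)
    maxBlocks /= gridZ;                         // 40 / 1 = 40
  if (maxBlocks < 1)
    maxBlocks = 1;
  if (gridX < 0)
    gridX = maxBlocks;                          // the '*' dimension gets 40

  // The whole grid (40 * 4 * 1 = 160 blocks) can be resident at once,
  // which cudaLaunchCooperativeKernel requires.
  std::printf("grid = (%ld, %ld, %ld)\n", gridX, gridY, gridZ);
  return 0;
}
```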
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 3db2336c90a7d4..0827e378c7c07e 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
 // CHECK-LABEL: llvm.func @_QQmain()
 // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    gpu.launch_func  @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5()  {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel

@llvmbot
Member

llvmbot commented Jan 24, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

(Identical to the @llvm/pr-subscribers-flang-runtime comment above, including the full diff.)

void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem, void **params, void **extra) {
Contributor
Is this extra a placeholder for the stream?

Contributor Author
No, stream handling isn't implemented yet; I'll add it to all three launch functions once we have proper support. The extra arg is modeled on the MLIR runtime entry point and is currently unused, but I'd like to keep it for consistency across the three functions. If we end up not needing it, I'll remove it from all three at the same time.
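For reference, the trailing extra parameter mirrors the CUDA driver API, where cuLaunchKernel accepts either an array of argument pointers or a packed argument buffer (normally one of the two is null):

```cpp
// Signature from the CUDA driver API (<cuda.h>) that this runtime ABI is
// modeled after; `extra` is an alternative way to pass a packed argument
// buffer instead of kernelParams.
CUresult cuLaunchKernel(CUfunction f,
    unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
    unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
    unsigned int sharedMemBytes, CUstream hStream,
    void **kernelParams, void **extra);
```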

clementval merged commit 48657bf into llvm:main on Jan 24, 2025 (12 checks passed).
clementval deleted the cuf_kernel_coop branch on January 24, 2025 at 23:52.