From 0885d06aff81b5f46d8133c7e0639a605e8fdf75 Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Thu, 17 Apr 2025 16:04:20 -0700
Subject: [PATCH 1/3] [flang][cuda] Only convert launch from CUDA Fortran kernels

---
 .../Transforms/CUFGPUToLLVMConversion.cpp           |  5 +++++
 flang/lib/Optimizer/Transforms/CUFOpConversion.cpp  |  5 +++++
 flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir       |  4 ++--
 flang/test/Fir/CUDA/cuda-launch.fir                 | 15 ++++++++++-----
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 02b4e6a5a469c..f22e44fac306b 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -82,6 +82,11 @@ struct GPULaunchKernelConversion
   mlir::LogicalResult
   matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
+    // Only convert gpu.launch_func for CUDA Fortran.
+    if (!op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+            cuf::getProcAttrName()))
+      return mlir::failure();
+
     mlir::Location loc = op.getLoc();
     auto *ctx = rewriter.getContext();
     mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 77364cb837c3c..e70ceb3a67d98 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -888,6 +888,11 @@ struct CUFLaunchOpConversion
     }
     if (procAttr)
       gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
+    else
+      // Set the default global attribute if the original was not found.
+      gpuLaunchOp->setAttr(cuf::getProcAttrName(),
+                           cuf::ProcAttributeAttr::get(
+                               op.getContext(), cuf::ProcAttribute::Global));
     rewriter.replaceOp(op, gpuLaunchOp);
     return mlir::success();
   }
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 0319213016e45..377bf8d9700f8 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -54,7 +54,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : ve
     llvm.br ^bb1(%44 : i64)
   ^bb3:  // pred: ^bb1
     %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
-    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr)
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc}
     %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
     %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, i32) -> !llvm.ptr
     %48 = llvm.mlir.constant(9 : i32) : i32
@@ -122,7 +122,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d
     %1 = llvm.mlir.constant(2 : index) : i64
     %2 = llvm.mlir.constant(0 : i32) : i32
     %3 = llvm.mlir.constant(10 : index) : i64
-    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc}
     llvm.return
   }
   gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 028279832c703..ef458dc6b481b 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -26,13 +26,13 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
   %c1024_i32 = arith.constant 1024 : i32
   %c6_i32 = arith.constant 6 : i32
   %c1_i32 = arith.constant 1 : i32
-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} {cuf.proc_attr = #cuf.cuda_proc}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()

-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}}
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}} {cuf.proc_attr = #cuf.cuda_proc}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()

-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref)
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref) {cuf.proc_attr = #cuf.cuda_proc}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref)
   return
 }
@@ -64,7 +64,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 }

 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
-// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}}) {cuf.proc_attr = #cuf.cuda_proc}

 // -----

@@ -107,7 +107,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK: %[[CONV_ADDR:.*]] = fir.convert %[[ADDROF]] : (!fir.ref>>>) -> !fir.llvm_ptr
 // CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr
 // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>>
-// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>)
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) {cuf.proc_attr = #cuf.cuda_proc}

 // -----

@@ -154,5 +154,10 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QQmain()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+<<<<<<< HEAD
 // CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
+=======
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 :
+// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} {cuf.proc_attr = #cuf.cuda_proc}
+>>>>>>> 9075a18bf3c4 ([flang][cuda] Only convert launch from CUDA Fortran kernels)

From 1156179660cebecbe9a69cc7528388f3738833ba Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Mon, 21 Apr 2025 09:36:34 -0700
Subject: [PATCH 2/3] Fix tests

---
 .../Optimizer/Transforms/CUFGPUToLLVMConversion.cpp | 10 +++++++++-
 flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir       |  2 +-
 flang/test/Fir/CUDA/cuda-launch.fir                 |  5 -----
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index f22e44fac306b..60264fec2afb0 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -298,7 +298,15 @@ class CUFGPUToLLVMConversion
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
     cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
-    target.addIllegalOp();
+
+    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>([&](mlir::gpu::LaunchFuncOp op) {
+      if (op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+          cuf::getProcAttrName()))
+        return false;
+      return true;
+    });
+
+    target.addIllegalOp();
     target.addLegalDialect();
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 377bf8d9700f8..a2f89d822c455 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -180,7 +180,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d
     %2 = llvm.mlir.constant(0 : i32) : i32
     %3 = llvm.mlir.constant(10 : index) : i64
     %token = cuf.stream_cast %stream : !llvm.ptr
-    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc}
     llvm.return
   }
   gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index ef458dc6b481b..1e8036e628650 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -154,10 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QQmain()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref) -> (!fir.ref, !fir.ref)
-<<<<<<< HEAD
 // CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
-=======
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 :
-// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} {cuf.proc_attr = #cuf.cuda_proc}
->>>>>>> 9075a18bf3c4 ([flang][cuda] Only convert launch from CUDA Fortran kernels)

From c9176ef22caad6335882b0c4bffc64ef4f9a818d Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Mon, 21 Apr 2025 09:41:04 -0700
Subject: [PATCH 3/3] clang-format

---
 .../Transforms/CUFGPUToLLVMConversion.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 60264fec2afb0..ef6b80b5739c6 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -298,14 +298,14 @@ class CUFGPUToLLVMConversion
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
     cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
-
-    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>([&](mlir::gpu::LaunchFuncOp op) {
-      if (op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
-          cuf::getProcAttrName()))
-        return false;
-      return true;
-    });
+    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
+        [&](mlir::gpu::LaunchFuncOp op) {
+          if (op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+                  cuf::getProcAttrName()))
+            return false;
+          return true;
+        });

     target.addIllegalOp();
     target.addLegalDialect();
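
For context, a minimal CUDA Fortran sketch of the kind of launch this series targets. This is an illustration only (the module and kernel names below are hypothetical, and it assumes a CUDA-Fortran-enabled flang or nvfortran): a chevron launch is lowered to cuf.kernel_launch and then, in CUFOpConversion, to a gpu.launch_func tagged with the cuf.proc_attr attribute, which is the marker the conversions above now key on; gpu.launch_func operations produced by other pipelines carry no such attribute and are left untouched.

    ! Illustrative CUDA Fortran example (names are hypothetical).
    module mod1
    contains
      ! A global (device) kernel; its launches get the CUF proc attribute.
      attributes(global) subroutine sub1()
      end subroutine sub1
    end module mod1

    program host
      use mod1
      ! Chevron launch: lowered to cuf.kernel_launch, then to gpu.launch_func
      ! carrying cuf.proc_attr, which CUFGPUToLLVMConversion converts.
      call sub1<<<1, 32>>>()
    end program host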