diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 02b4e6a5a469c..ef6b80b5739c6 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -82,6 +82,11 @@ struct GPULaunchKernelConversion mlir::LogicalResult matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { + // Only convert gpu.launch_func for CUDA Fortran. + if (!op.getOperation()->getAttrOfType( + cuf::getProcAttrName())) + return mlir::failure(); + mlir::Location loc = op.getLoc(); auto *ctx = rewriter.getContext(); mlir::ModuleOp mod = op->getParentOfType(); @@ -293,7 +298,15 @@ class CUFGPUToLLVMConversion fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns); - target.addIllegalOp(); + + target.addDynamicallyLegalOp( + [&](mlir::gpu::LaunchFuncOp op) { + if (op.getOperation()->getAttrOfType( + cuf::getProcAttrName())) + return false; + return true; + }); + target.addIllegalOp(); target.addLegalDialect(); if (mlir::failed(mlir::applyPartialConversion(getOperation(), target, diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 77364cb837c3c..e70ceb3a67d98 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -888,6 +888,11 @@ struct CUFLaunchOpConversion } if (procAttr) gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr); + else + // Set default global attribute of the original was not found. + gpuLaunchOp->setAttr(cuf::getProcAttrName(), + cuf::ProcAttributeAttr::get( + op.getContext(), cuf::ProcAttribute::Global)); rewriter.replaceOp(op, gpuLaunchOp); return mlir::success(); } diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir index 0319213016e45..a2f89d822c455 100644 --- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir +++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir @@ -54,7 +54,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : ve llvm.br ^bb1(%44 : i64) ^bb3: // pred: ^bb1 %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()> - gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) + gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc} %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()> %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, i32) -> !llvm.ptr %48 = llvm.mlir.constant(9 : i32) : i32 @@ -122,7 +122,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d %1 = llvm.mlir.constant(2 : index) : i64 %2 = llvm.mlir.constant(0 : i32) : i32 %3 = llvm.mlir.constant(10 : index) : i64 - gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 + gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc} llvm.return } gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">] @@ -180,7 +180,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d %2 = llvm.mlir.constant(0 : i32) : i32 %3 = llvm.mlir.constant(10 : index) : i64 %token = cuf.stream_cast %stream : !llvm.ptr - gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 + gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc} llvm.return } gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">] diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index 028279832c703..1e8036e628650 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -26,13 +26,13 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e %c1024_i32 = arith.constant 1024 : i32 %c6_i32 = arith.constant 6 : i32 %c1_i32 = arith.constant 1 : i32 - // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} + // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} {cuf.proc_attr = #cuf.cuda_proc} cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>() - // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}} + // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}} {cuf.proc_attr = #cuf.cuda_proc} cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>() - // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref) + // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref) {cuf.proc_attr = #cuf.cuda_proc} cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref) return } @@ -64,7 +64,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e } // CHECK-LABEL: func.func @_QMmod1Phost_sub() -// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}}) +// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}}) {cuf.proc_attr = #cuf.cuda_proc} // ----- @@ -107,7 +107,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e // CHECK: %[[CONV_ADDR:.*]] = fir.convert %[[ADDROF]] : (!fir.ref>>>) -> !fir.llvm_ptr // CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> -// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) +// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) {cuf.proc_attr = #cuf.cuda_proc} // -----