diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 0776346870c72..8ca2869993443 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -143,7 +143,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { llvm::SmallVector operandsTypes; for (auto arg : gpuLaunchFunc.getKernelOperands()) operandsTypes.push_back(arg.getType()); - auto fctTy = mlir::FunctionType::get(&context, operandsTypes, {}); + auto fctTy = mlir::FunctionType::get(&context, operandsTypes, + gpuLaunchFunc.getResultTypes()); if (!hasPortableSignature(fctTy, op)) convertCallOp(gpuLaunchFunc, fctTy); } else if (auto addr = mlir::dyn_cast(op)) { @@ -520,10 +521,14 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { llvm::SmallVector newCallResults; // TODO propagate/update call argument and result attributes. if constexpr (std::is_same_v, mlir::gpu::LaunchFuncOp>) { + mlir::Value asyncToken = callOp.getAsyncToken(); auto newCall = A::create(*rewriter, loc, callOp.getKernel(), callOp.getGridSizeOperandValues(), callOp.getBlockSizeOperandValues(), - callOp.getDynamicSharedMemorySize(), newOpers); + callOp.getDynamicSharedMemorySize(), newOpers, + asyncToken ? asyncToken.getType() : nullptr, + callOp.getAsyncDependencies(), + /*clusterSize=*/std::nullopt); if (callOp.getClusterSizeX()) newCall.getClusterSizeXMutable().assign(callOp.getClusterSizeX()); if (callOp.getClusterSizeY()) diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir index 48fee10f3db97..5562e00085526 100644 --- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir +++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir @@ -108,3 +108,23 @@ module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.k } } +// ----- + +module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + gpu.module @testmod { + gpu.func @_QPtest(%arg0: complex) -> () kernel { + gpu.return + } + } + func.func @main(%arg0: complex) { + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(0 : i32) : i32 + %2 = fir.alloca i64 + %3 = cuf.stream_cast %2 : !fir.ref + %4 = gpu.launch_func async [%3] @testmod::@_QPtest blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %1 args(%arg0 : complex) {cuf.proc_attr = #cuf.cuda_proc} + return + } +} + +// CHECK-LABEL: func.func @main +// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : !fir.vector<2:f32>) {cuf.proc_attr = #cuf.cuda_proc}