Commit 0589409

[flang][cuda] Support gpu.launch_func with async token in target rewrite pass (llvm#165485)
1 parent 3fc24a2 commit 0589409

2 files changed: 27 additions, 2 deletions


flang/lib/Optimizer/CodeGen/TargetRewrite.cpp

Lines changed: 7 additions & 2 deletions
@@ -143,7 +143,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase<TargetRewrite> {
       llvm::SmallVector<mlir::Type> operandsTypes;
       for (auto arg : gpuLaunchFunc.getKernelOperands())
         operandsTypes.push_back(arg.getType());
-      auto fctTy = mlir::FunctionType::get(&context, operandsTypes, {});
+      auto fctTy = mlir::FunctionType::get(&context, operandsTypes,
+                                           gpuLaunchFunc.getResultTypes());
       if (!hasPortableSignature(fctTy, op))
         convertCallOp(gpuLaunchFunc, fctTy);
     } else if (auto addr = mlir::dyn_cast<fir::AddrOfOp>(op)) {
@@ -520,10 +521,14 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase<TargetRewrite> {
     llvm::SmallVector<mlir::Value, 1> newCallResults;
     // TODO propagate/update call argument and result attributes.
     if constexpr (std::is_same_v<std::decay_t<A>, mlir::gpu::LaunchFuncOp>) {
+      mlir::Value asyncToken = callOp.getAsyncToken();
       auto newCall = A::create(*rewriter, loc, callOp.getKernel(),
                                callOp.getGridSizeOperandValues(),
                                callOp.getBlockSizeOperandValues(),
-                               callOp.getDynamicSharedMemorySize(), newOpers);
+                               callOp.getDynamicSharedMemorySize(), newOpers,
+                               asyncToken ? asyncToken.getType() : nullptr,
+                               callOp.getAsyncDependencies(),
+                               /*clusterSize=*/std::nullopt);
       if (callOp.getClusterSizeX())
         newCall.getClusterSizeXMutable().assign(callOp.getClusterSizeX());
       if (callOp.getClusterSizeY())
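
In short, the pass now includes the launch op's result types when it builds the signature used for the hasPortableSignature check, and when it rebuilds the gpu.launch_func it carries over the async token type and the async dependency list instead of dropping them. A minimal before/after sketch of the intended effect on an x86-64 target, where a complex<f32> kernel argument is rewritten to !fir.vector<2:f32>; the value names (%stream, %z, %z_abi) are illustrative, and the authoritative output is the test below:

  // Before the target-rewrite pass: the launch produces an async token.
  %token = gpu.launch_func async [%stream] @testmod::@_QPtest
      blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) : i64
      dynamic_shared_memory_size %c0_i32 args(%z : complex<f32>)

  // After the pass: the argument uses the target ABI type, while the async
  // token and dependency list survive the rewrite.
  %token2 = gpu.launch_func async [%stream] @testmod::@_QPtest
      blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) : i64
      dynamic_shared_memory_size %c0_i32 args(%z_abi : !fir.vector<2:f32>)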

flang/test/Fir/CUDA/cuda-target-rewrite.mlir

Lines changed: 20 additions & 0 deletions
@@ -108,3 +108,23 @@ module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.k
   }
 }
 
+// -----
+
+module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @testmod {
+    gpu.func @_QPtest(%arg0: complex<f32>) -> () kernel {
+      gpu.return
+    }
+  }
+  func.func @main(%arg0: complex<f32>) {
+    %0 = llvm.mlir.constant(0 : i64) : i64
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = fir.alloca i64
+    %3 = cuf.stream_cast %2 : !fir.ref<i64>
+    %4 = gpu.launch_func async [%3] @testmod::@_QPtest blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %1 args(%arg0 : complex<f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @main
+// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : !fir.vector<2:f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
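
The new case can be exercised locally through fir-opt and FileCheck; a plausible invocation, assuming the file's existing RUN line drives the target-rewrite pass (flag spelling taken from other flang target-rewrite tests) and that the target comes from the module's llvm.target_triple attribute:

  fir-opt --target-rewrite flang/test/Fir/CUDA/cuda-target-rewrite.mlir | \
      FileCheck flang/test/Fir/CUDA/cuda-target-rewrite.mlir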
