Skip to content

Commit 49f8ccd

Browse files
authored
[flang][cuda] Pass stream information to kernel launch functions (#135246)
1 parent 641de84 commit 49f8ccd

File tree

4 files changed

+29
-15
lines changed

4 files changed

+29
-15
lines changed

flang-rt/lib/cuda/kernel.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ extern "C" {
1616

1717
void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
1818
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
19-
int32_t smem, void **params, void **extra) {
19+
intptr_t stream, int32_t smem, void **params, void **extra) {
2020
dim3 gridDim;
2121
gridDim.x = gridX;
2222
gridDim.y = gridY;
@@ -74,15 +74,15 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
7474
Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
7575
terminator.Crash("Too many invalid grid dimensions");
7676
}
77-
cudaStream_t stream = 0; // TODO stream managment
77+
cudaStream_t cuStream = 0; // TODO stream managment
7878
CUDA_REPORT_IF_ERROR(
79-
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
79+
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, cuStream));
8080
}
8181

8282
void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
8383
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
8484
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
85-
int32_t smem, void **params, void **extra) {
85+
intptr_t stream, int32_t smem, void **params, void **extra) {
8686
cudaLaunchConfig_t config;
8787
config.gridDim.x = gridX;
8888
config.gridDim.y = gridY;
@@ -153,7 +153,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
153153

154154
void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
155155
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
156-
intptr_t blockZ, int32_t smem, void **params, void **extra) {
156+
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
157+
void **extra) {
157158
dim3 gridDim;
158159
gridDim.x = gridX;
159160
gridDim.y = gridY;
@@ -211,9 +212,9 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
211212
Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
212213
terminator.Crash("Too many invalid grid dimensions");
213214
}
214-
cudaStream_t stream = 0; // TODO stream managment
215+
cudaStream_t cuStream = 0; // TODO stream managment
215216
CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
216-
kernel, gridDim, blockDim, params, smem, stream));
217+
kernel, gridDim, blockDim, params, smem, cuStream));
217218
}
218219

219220
} // extern "C"

flang/include/flang/Runtime/CUDA/kernel.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,18 @@ extern "C" {
2121

2222
void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
2323
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
24-
intptr_t blockZ, int32_t smem, void **params, void **extra);
24+
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
25+
void **extra);
2526

2627
void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
2728
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
2829
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
29-
int32_t smem, void **params, void **extra);
30+
intptr_t stream, int32_t smem, void **params, void **extra);
3031

3132
void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
3233
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
33-
intptr_t blockZ, int32_t smem, void **params, void **extra);
34+
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
35+
void **extra);
3436

3537
} // extern "C"
3638

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
121121
voidTy,
122122
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
123123
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
124-
llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
124+
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
125125
/*isVarArg=*/false);
126126
auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
127127
mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,14 +133,18 @@ struct GPULaunchKernelConversion
133133
launchKernelFuncOp.setVisibility(
134134
mlir::SymbolTable::Visibility::Private);
135135
}
136+
mlir::Value stream = adaptor.getAsyncObject();
137+
if (!stream)
138+
stream = rewriter.create<mlir::LLVM::ConstantOp>(
139+
loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
136140
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
137141
op, funcTy, cufLaunchClusterKernel,
138142
mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
139143
adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
140144
adaptor.getGridSizeX(), adaptor.getGridSizeY(),
141145
adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
142146
adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
143-
dynamicMemorySize, kernelArgs, nullPtr});
147+
stream, dynamicMemorySize, kernelArgs, nullPtr});
144148
} else {
145149
auto procAttr =
146150
op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
@@ -153,7 +157,8 @@ struct GPULaunchKernelConversion
153157
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
154158
voidTy,
155159
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
156-
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
160+
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
161+
i32Ty, ptrTy, ptrTy},
157162
/*isVarArg=*/false);
158163
auto cufLaunchKernel =
159164
mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -165,12 +170,18 @@ struct GPULaunchKernelConversion
165170
launchKernelFuncOp.setVisibility(
166171
mlir::SymbolTable::Visibility::Private);
167172
}
173+
174+
mlir::Value stream = adaptor.getAsyncObject();
175+
if (!stream)
176+
stream = rewriter.create<mlir::LLVM::ConstantOp>(
177+
loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
178+
168179
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
169180
op, funcTy, cufLaunchKernel,
170181
mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
171182
adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
172183
adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
173-
adaptor.getBlockSizeZ(), dynamicMemorySize,
184+
adaptor.getBlockSizeZ(), stream, dynamicMemorySize,
174185
kernelArgs, nullPtr});
175186
}
176187

flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
113113
// -----
114114

115115
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
116-
llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
116+
llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
117117
llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
118118
llvm.return
119119
}

0 commit comments

Comments
 (0)