
Commit 4889214

[mlir][sparse][gpu] generate single module, unique kernel names
This fixes a TODO in the first version.

Reviewed By: Peiming
Differential Revision: https://reviews.llvm.org/D148406

1 parent: a753eca

File tree: 4 files changed (+55, -16 lines)


mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 20 additions & 11 deletions
@@ -40,24 +40,36 @@ static void markAsGPUContainer(ModuleOp topModule) {
                     UnitAttr::get(topModule->getContext()));
 }
 
-/// Constructs a new GPU module (for GPU kernels) inside the given top module.
-static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule,
-                                     StringRef name) {
+/// Constructs a new GPU module (for GPU kernels) inside the given top module,
+/// or returns an existing GPU module if one was built previously.
+static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
+  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
+    return op; // existing
   markAsGPUContainer(topModule);
   builder.setInsertionPointToStart(&topModule.getBodyRegion().front());
-  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(), name);
+  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),
+                                          "sparse_kernels");
 }
 
 /// Constructs a new GPU kernel in the given GPU module.
 static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
-                                 StringRef name, SmallVectorImpl<Value> &args) {
+                                 SmallVectorImpl<Value> &args) {
+  // Get a unique kernel name. Not very creative,
+  // but we simply try kernel0, kernel1, etc.
+  unsigned kernelNumber = 0;
+  SmallString<16> kernelName;
+  do {
+    kernelName.clear();
+    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
+  } while (gpuModule.lookupSymbol(kernelName));
+  // Then we insert a new kernel with given arguments into the module.
   builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front());
   SmallVector<Type> argsTp;
   for (unsigned i = 0, e = args.size(); i < e; i++)
     argsTp.push_back(args[i].getType());
   FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
   auto gpuFunc =
-      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), name, type);
+      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
   gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                    builder.getUnitAttr());
   return gpuFunc;

@@ -208,12 +220,9 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
       args.push_back(genHostRegisterMemref(rewriter, loc, b));
     auto saveIp = rewriter.saveInsertionPoint();
     // Set up GPU module and construct GPU function.
-    //
-    // TODO: only generate once, avoid name conflict
-    //
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
-    auto gpuModule = genGPUModule(rewriter, topModule, "sparsekernels");
-    auto gpuFunc = genGPUFunc(rewriter, gpuModule, "kernel", args);
+    auto gpuModule = genGPUModule(rewriter, topModule);
+    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
     // Generate code that launches the kernel.
     rewriter.restoreInsertionPoint(saveIp);
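The uniquing scheme in genGPUFunc is a simple linear probe: try kernel0, kernel1, and so on until a name is free in the GPU module's symbol table. Below is a minimal standalone sketch of the same pattern; the helper name pickKernelName is illustrative (not part of the MLIR API), and a std::set stands in for the module's symbol table that gpuModule.lookupSymbol() consults.

// Standalone sketch of the unique-name probe used above (illustrative only).
#include <iostream>
#include <set>
#include <string>

// Probe kernel0, kernel1, ... until a name is not yet taken.
// The std::set stands in for the GPU module's symbol table.
static std::string pickKernelName(const std::set<std::string> &symbols) {
  unsigned kernelNumber = 0;
  std::string kernelName;
  do {
    kernelName = "kernel" + std::to_string(kernelNumber++);
  } while (symbols.count(kernelName));
  return kernelName;
}

int main() {
  std::set<std::string> symbols;
  // Each insertion claims the next free name, mirroring genGPUFunc above.
  for (int i = 0; i < 3; ++i) {
    std::string name = pickKernelName(symbols);
    symbols.insert(name);
    std::cout << name << "\n"; // prints kernel0, kernel1, kernel2
  }
  return 0;
}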
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --pre-sparsification-rewrite \
+// RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
+// RUN:             --sparse-gpu-codegen | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
+
+//
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-DAG:   gpu.func @kernel0
+// CHECK-DAG:   gpu.func @kernel1
+//
+// CHECK-LABEL: func.func @matmuls
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel1 blocks
+//
+func.func @matmuls(%A: tensor<1024x8xf64>,
+                   %B: tensor<8x1024xf64, #CSR>,
+                   %C: tensor<1024x1024xf64, #CSR>) -> tensor<1024x1024xf64> {
+  %Z = arith.constant dense<0.0> : tensor<1024x1024xf64>
+  %T = linalg.matmul
+         ins(%A, %B: tensor<1024x8xf64>, tensor<8x1024xf64, #CSR>)
+         outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  %D = linalg.matmul
+         ins(%T, %C: tensor<1024x1024xf64>, tensor<1024x1024xf64, #CSR>)
+         outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  return %D : tensor<1024x1024xf64>
+}
+

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir

Lines changed: 3 additions & 2 deletions
@@ -8,7 +8,8 @@
 //
 // Compute matrix matrix C = AB
 //
-// CHECK-LABEL: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-LABEL: gpu.func @kernel0(
 // CHECK-SAME:  %[[VAL_0:.*0]]: index,
 // CHECK-SAME:  %[[VAL_1:.*1]]: index,
 // CHECK-SAME:  %[[VAL_2:.*2]]: memref<?xindex>,
@@ -51,7 +52,7 @@
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
-// CHECK:       gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:       gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir

Lines changed: 3 additions & 3 deletions
@@ -8,8 +8,8 @@
 //
 // Compute matrix vector y = Ax
 //
-//
-// CHECK: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK:       gpu.func @kernel0(
 // CHECK-SAME:  %[[VAL_0:.*0]]: index,
 // CHECK-SAME:  %[[VAL_1:.*1]]: memref<?xf64>,
 // CHECK-SAME:  %[[VAL_2:.*2]]: memref<?xindex>,
@@ -48,7 +48,7 @@
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
-// CHECK:       gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:       gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
