diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index d08e7ceb9e6c6..42a017db300af 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -803,7 +803,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [ Optional:$clusterSizeX, Optional:$clusterSizeY, Optional:$clusterSizeZ, - Optional:$dynamicSharedMemorySize)>, + Optional:$dynamicSharedMemorySize, + OptionalAttr:$kernelFunc, + OptionalAttr:$kernelModule)>, Results<(outs Optional:$asyncToken)> { let summary = "GPU kernel launch operation"; @@ -837,6 +839,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [ - a variadic number of Workgroup memory attributions. - a variadic number of Private memory attributions. + The `kernelFunc` and `kernelModule` attributes are optional and specify + the kernel name and a module in which the kernel should be outlined. + Syntax: ``` diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index 5f6556d915f41..ba0c80c50211e 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -364,9 +364,15 @@ class GpuKernelOutliningPass Block::iterator insertPt(func->getNextNode()); auto funcWalkResult = func.walk([&](gpu::LaunchOp op) { SetVector operands; - std::string kernelFnName = - Twine(op->getParentOfType().getName(), "_kernel") - .str(); + std::string kernelFnName; + if (op.getKernelFunc()) { + kernelFnName = op.getKernelFunc()->getRootReference().str(); + } else { + kernelFnName = + Twine(op->getParentOfType().getName(), + "_kernel") + .str(); + } gpu::GPUFuncOp outlinedFunc = outlineKernelFuncImpl(op, kernelFnName, operands); @@ -374,7 +380,7 @@ class GpuKernelOutliningPass // Create nested module and insert outlinedFunc. The module will // originally get the same name as the function, but may be renamed on // insertion into the parent module. 
- auto kernelModule = createKernelModule(outlinedFunc, symbolTable); + auto kernelModule = createKernelModule(op, outlinedFunc, symbolTable); symbolTable.insert(kernelModule, insertPt); // Potentially changes signature, pulling in constants. @@ -395,7 +401,8 @@ class GpuKernelOutliningPass private: /// Returns a gpu.module containing kernelFunc and all callees (recursive). - gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, + gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp, + gpu::GPUFuncOp kernelFunc, const SymbolTable &parentSymbolTable) { // TODO: This code cannot use an OpBuilder because it must be inserted into // a SymbolTable by the caller. SymbolTable needs to be refactored to @@ -403,8 +410,23 @@ class GpuKernelOutliningPass // and then this needs to use the OpBuilder. auto *context = getOperation().getContext(); OpBuilder builder(context); - auto kernelModule = builder.create(kernelFunc.getLoc(), - kernelFunc.getName()); + std::string kernelModuleName; + gpu::GPUModuleOp kernelModule; + if (gpuLaunchOp.getKernelModule()) { + kernelModuleName = + gpuLaunchOp.getKernelModule()->getRootReference().str(); + kernelModule = + parentSymbolTable.lookup(kernelModuleName); + } else { + kernelModuleName = kernelFunc.getName(); + } + + // Check if the module already exists in the symbol table + if (!kernelModule) { + // If not found, create a new GPU module + kernelModule = builder.create(kernelFunc.getLoc(), + kernelModuleName); + } // If a valid data layout spec was provided, attach it to the kernel module. // Otherwise, the default data layout will be used. 
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 6e682b26f6c95..d48fa054432d1 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -508,3 +508,125 @@ func.func @launch_cluster() { // CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> () // CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref +// ----- +// This test tests the two optional attributes kernelModule and kernelFunc for gpu.launch +// CHECK-LABEL: func.func @testKernelAttributes() +// CHECK: gpu.launch_func @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) +// CHECK: gpu.module @test_module +// CHECK: gpu.func @test_kernel_func() +func.func @testKernelAttributes() { + %gDimX = arith.constant 8 : index + %gDimY = arith.constant 12 : index + %gDimZ = arith.constant 16 : index + %bDimX = arith.constant 32 : index + %bDimY = arith.constant 16 : index + %bDimZ = arith.constant 8 : index + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + "some_op"(%bx, %tx) : (index, index) -> () + gpu.terminator + } {kernelModule = @test_module, kernelFunc = @test_kernel_func} + return +} + +// ----- +// This test tests the two optional attributes kernelModule and kernelFunc for gpu.launch, when kernelModule already exists. 
+ +// CHECK-LABEL: gpu.module @existing_module +// CHECK: gpu.func @test_kernel_func() +// CHECK: gpu.func @test_kernel_func_0() +// CHECK-NOT: gpu.module @testExistingModule_kernel +// CHECK-NOT: gpu.func @testExistingModule_kernel() +// CHECK: func.func @testExistingModule() +// CHECK: gpu.launch_func @existing_module::@test_kernel_func_0 blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) + +gpu.module @existing_module { + gpu.func @test_kernel_func() { + gpu.return + } +} + +func.func @testExistingModule() { + %gDimX = arith.constant 8 : index + %gDimY = arith.constant 12 : index + %gDimZ = arith.constant 16 : index + %bDimX = arith.constant 32 : index + %bDimY = arith.constant 16 : index + %bDimZ = arith.constant 8 : index + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + "some_op"(%bx, %tx) : (index, index) -> () + gpu.terminator + } {kernelModule = @existing_module, kernelFunc = @test_kernel_func} + return +} + +// ----- +// This test tests the optional attribute kernelModule for gpu.launch. 
+// CHECK-LABEL: func.func @testKernelModuleOnly() +// CHECK: gpu.launch_func @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) +// CHECK: gpu.module @test_module +// CHECK: gpu.func @testKernelModuleOnly_kernel() +func.func @testKernelModuleOnly() { + %gDimX = arith.constant 8 : index + %gDimY = arith.constant 12 : index + %gDimZ = arith.constant 16 : index + %bDimX = arith.constant 32 : index + %bDimY = arith.constant 16 : index + %bDimZ = arith.constant 8 : index + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + "some_op"(%bx, %tx) : (index, index) -> () + gpu.terminator + } {kernelModule = @test_module} + return +} + +// ----- +// This test tests the optional attribute kernelFunc for gpu.launch. +// CHECK-LABEL: func.func @testKernelFuncOnly() +// CHECK: gpu.launch_func @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) + +// CHECK: gpu.module @test_kernel_func +// CHECK: gpu.func @test_kernel_func() +func.func @testKernelFuncOnly() { + %gDimX = arith.constant 8 : index + %gDimY = arith.constant 12 : index + %gDimZ = arith.constant 16 : index + %bDimX = arith.constant 32 : index + %bDimY = arith.constant 16 : index + %bDimZ = arith.constant 8 : index + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + "some_op"(%bx, %tx) : (index, index) -> () + gpu.terminator + } {kernelFunc = @test_kernel_func} + return +} + +// ----- +// This test tests gpu.launch when optional attributes kernelModule and kernelFunc are not specified. 
+// CHECK-LABEL: func.func @testNoAttributes() +// CHECK: gpu.launch_func @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) + +// CHECK: gpu.module @testNoAttributes_kernel +// CHECK: gpu.func @testNoAttributes_kernel() +func.func @testNoAttributes() { + %gDimX = arith.constant 8 : index + %gDimY = arith.constant 12 : index + %gDimZ = arith.constant 16 : index + %bDimX = arith.constant 32 : index + %bDimY = arith.constant 16 : index + %bDimZ = arith.constant 8 : index + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + "some_op"(%bx, %tx) : (index, index) -> () + gpu.terminator + } + return +}