
Commit 746ad1e

[GPU] Add C promotion capability in promote matmul operands pass (#19256)
This PR sets up the convention that when the operand index for promotion is beyond the DPS inputs, we promote the tied result of the corresponding DPS init. Result promotion is implemented in this PR.

Co-authored-by: Quinn Dawkins <[email protected]>
Signed-off-by: Nirvedh <[email protected]>
1 parent cef4178 commit 746ad1e
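
For illustration, here is a minimal MLIR sketch of the new convention, based on the promote_result lit test added in this commit. Value names (%a, %b, %fill, %d0, %d1, %empty) and the exact attribute printing are illustrative assumptions, not verbatim pass output. With promote_operands = [2] on a matmul that has two DPS inputs, index 2 falls past the inputs, so the result tied to the single DPS init is promoted:

  // Input: index 2 is beyond the two DPS inputs, so it selects the matmul result.
  %mm = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>}
          ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>)
          outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>

  // After the pass (sketch): the result is first staged in a workgroup-memory
  // allocation, then copied out again with a derived thread config.
  %alloc = bufferization.alloc_tensor(%d0, %d1) {memory_space = #gpu.address_space<workgroup>} : tensor<?x?xf32>
  %staged = linalg.copy ins(%mm : tensor<?x?xf32>) outs(%alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %promoted = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
                ins(%staged : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>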

4 files changed, +221 −9 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp

Lines changed: 96 additions & 9 deletions
@@ -9,8 +9,12 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -25,11 +29,83 @@ namespace mlir::iree_compiler {
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

 namespace {
+/// Helper to insert a copy with a derived thread config.
+Value promoteValue(OpBuilder &builder, Location loc, Value v) {
+  auto tensorType = cast<RankedTensorType>(v.getType());
+  SmallVector<OpFoldResult> mixedSizes = tensor::getMixedSizes(builder, loc, v);
+  Value empty = builder.create<tensor::EmptyOp>(loc, mixedSizes,
+                                                tensorType.getElementType());
+  auto copy = builder.create<linalg::CopyOp>(loc, v, empty);
+  setLoweringConfig(
+      copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
+  return copy.getResult(0);
+}
+
+/// Helper to promote results. If the target value is consumed only by a
+/// `tensor.extract_slice`, this will promote the result of the slice instead.
+void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
+  IRRewriter rewriter(builder);
+  Location loc = op->getLoc();
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointAfterValue(valToMakeShared);
+  tensor::ExtractSliceOp extractSliceOp;
+  SetVector<Operation *> opsToReplaceUseIn;
+  Value valueToReplace = valToMakeShared;
+  for (auto user : valToMakeShared.getUsers()) {
+    extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(user);
+    if (extractSliceOp) {
+      // If the result is consumed by an extract_slice, we expect there to
+      // be exactly one extract_slice that is then consumed.
+      // TODO(nirvedhmeshram): This is a fairly special case. Instead we should
+      // just promote results before doing the padding that introduces the
+      // extract_slice.
+      if (!valToMakeShared.hasOneUse())
+        return;
+      valueToReplace = extractSliceOp.getResult();
+      for (auto user : extractSliceOp->getUsers()) {
+        opsToReplaceUseIn.insert(user);
+      }
+      break;
+    }
+    opsToReplaceUseIn.insert(user);
+  }
+  auto tensorType = cast<RankedTensorType>(valToMakeShared.getType());
+  if (!tensorType) {
+    return;
+  }
+  SmallVector<Value> dynamicSizes;
+  for (auto [idx, size] : llvm::enumerate(tensorType.getShape())) {
+    if (ShapedType::isDynamic(size)) {
+      dynamicSizes.push_back(
+          rewriter.create<tensor::DimOp>(loc, valToMakeShared, idx));
+    }
+  }
+  Attribute addressSpace = gpu::AddressSpaceAttr::get(
+      rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
+  auto alloc = rewriter.create<bufferization::AllocTensorOp>(loc, tensorType,
+                                                             dynamicSizes);
+  alloc.setMemorySpaceAttr(addressSpace);
+  auto copy =
+      rewriter.create<linalg::CopyOp>(loc, valToMakeShared, alloc.getResult());
+
+  Value replacement = copy.getResult(0);
+  // If an extract_slice is present, we make it consume the new copy.
+  if (extractSliceOp) {
+    extractSliceOp.getSourceMutable().assign(replacement);
+    replacement = valueToReplace;
+  }
+
+  rewriter.setInsertionPointAfterValue(replacement);
+  replacement = promoteValue(rewriter, loc, replacement);
+  valueToReplace.replaceUsesWithIf(replacement, [&](OpOperand &use) {
+    return opsToReplaceUseIn.contains(use.getOwner());
+  });
+}

 /// Inserts a `linalg.copy` directly before the given operation on the
 /// specified operand, for example with operand index = 1:
 ///
-///   linalg.matmul ins(%0, %1)
+///   %2 = linalg.matmul ins(%0, %1)
 ///
 /// becomes
 ///
@@ -41,7 +117,24 @@ namespace {
 /// If the producer is already a tilable op, the producer is just annotated with
 /// #iree_gpu.derived_thread_config to indicate that it should be distributed
 /// to threads independently of the matmul.
+/// Additionally, we can also promote results, so in the above example, for
+/// index = 2, we will generate:
+///   %out_buffer = bufferization.alloc_tensor
+///   %copy1 = linalg.copy %2 to %out_buffer
+///   %copy2 = linalg.copy %copy1 to %empty {
+///     lowering_config = #iree_gpu.derived_thread_config}
 void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
+  auto dpsOp = dyn_cast<DestinationStyleOpInterface>(op);
+  if (!dpsOp)
+    return;
+  // We use the convention that if the index passed is beyond the inputs,
+  // then we promote the result of the corresponding dps init.
+  if (index >= dpsOp.getNumDpsInputs()) {
+    index -= dpsOp.getNumDpsInputs();
+    assert(index < op->getNumResults() &&
+           "trying to promote an out-of-bounds result index");
+    return promoteResult(builder, op, op->getResult(index));
+  }
   Value operand = op->getOperand(index);

   if (auto producer = operand.getDefiningOp<TilingInterface>()) {
@@ -70,14 +163,8 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
     return;
   }

-  SmallVector<OpFoldResult> mixedSizes =
-      tensor::getMixedSizes(builder, op->getLoc(), operand);
-  Value empty = builder.create<tensor::EmptyOp>(op->getLoc(), mixedSizes,
-                                                tensorType.getElementType());
-  auto copy = builder.create<linalg::CopyOp>(op->getLoc(), operand, empty);
-  setLoweringConfig(
-      copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
-  op->setOperand(index, copy.getResult(0));
+  auto replacement = promoteValue(builder, op->getLoc(), operand);
+  op->setOperand(index, replacement);
 }

 struct GPUPromoteMatmulOperandsPass final

compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,8 @@ def GPUPromoteMatmulOperandsPass :
   let summary = "Pass to insert copies with a different thread configuration "
                 "on matmul operands";
   let dependentDialects = [
+    "::mlir::bufferization::BufferizationDialect",
+    "::mlir::gpu::GPUDialect",
     "::mlir::linalg::LinalgDialect",
     "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
   ];

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir

Lines changed: 51 additions & 0 deletions
@@ -106,3 +106,54 @@ func.func @promote_pad(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tenso
 // CHECK: linalg.copy
 // CHECK-SAME: derived_thread_config
 // CHECK: return
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>
+func.func @promote_result(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %mdim : index, %ndim : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty(%mdim, %ndim) : tensor<?x?xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %mm : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func @promote_result(
+// CHECK: %[[MATMUL:.+]] = linalg.matmul
+// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor
+// CHECK: %[[COPY1:.+]] = linalg.copy
+// CHECK-SAME: ins(%[[MATMUL]] : tensor<?x?xf32>) outs(%[[ALLOC]] : tensor<?x?xf32>)
+// CHECK-SAME: -> tensor<?x?xf32>
+// CHECK: %[[COPY2:.+]] = linalg.copy
+// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config}
+// CHECK-SAME: ins(%[[COPY1]] : tensor<?x?xf32>)
+// CHECK: return %[[COPY2]] : tensor<?x?xf32>
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>
+func.func @promote_padded_result(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %mdim : index, %ndim : index, %pad : index, %slice : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty(%mdim, %ndim) : tensor<?x?xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %padded_fill = tensor.pad %fill low[0, 0] high[%pad, %pad] {
+  ^bb0(%arg3: index, %arg4: index):
+    tensor.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<?x?xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>) outs(%padded_fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %mm_slice = tensor.extract_slice %mm [0, 0] [%slice, %slice] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  return %mm_slice : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func @promote_padded_result(
+// CHECK: %[[MATMUL:.+]] = linalg.matmul
+// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor
+// CHECK: %[[COPY1:.+]] = linalg.copy
+// CHECK-SAME: ins(%[[MATMUL]] : tensor<?x?xf32>) outs(%[[ALLOC]] : tensor<?x?xf32>)
+// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[COPY1]]
+// CHECK: %[[COPY2:.+]] = linalg.copy
+// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config}
+// CHECK-SAME: ins(%[[EXTRACT]] : tensor<?x?xf32>)
+// CHECK: return %[[COPY2]] : tensor<?x?xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 72 additions & 0 deletions
@@ -1019,3 +1019,75 @@ hal.executable public @main {
 // CHECK: scf.for
 // CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
 // CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 2],
+  subgroup = [2, 2],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+  promote_operands = [0, 1, 2]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_transpose_b_promote_result ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_transpose_b_promote_result()
+          attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 2, 1] subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f16
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
+        %5 = tensor.empty() : tensor<2048x10240xf32>
+        %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
+        %7 = linalg.matmul_transpose_b {lowering_config = #config}
+          ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>)
+          outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @matmul_transpose_b_promote_result
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<4x16x4x16xf32, #gpu.address_space<workgroup>>
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: scf.yield
+// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
+// CHECK: vector.transfer_write %[[LOOP_T]]
+// CHECK: scf.for {{.*}} {
+// CHECK: %[[SHARED_READ:.+]] = vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<4xf32>
+// CHECK: vector.transfer_write %[[SHARED_READ]], %[[B2]]
+// CHECK: }
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
