Skip to content

Commit f828914

Browse files
[GPU] Move tile-and-distribute pass before packing to intrinsic for the TileAndFuse pipeline (#19053)
We want to distribute to workgroups first so that we can promote operands to handle unaligned-to-intrinsic cases before we concretize the MMA shapes.

Signed-off-by: Nirvedh Meshram <[email protected]>
1 parent 8391943 commit f828914

File tree

6 files changed

+25
-17
lines changed

6 files changed

+25
-17
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,19 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
281281
for (auto [i, mDim] : llvm::enumerate(mDims)) {
282282
workgroupTileSizes[mDim] =
283283
schedule->mSubgroupCounts[i] * schedule->mTileSizes[i];
284+
// Multiply by the intrinsic shape for the inner most dim as we distribute
285+
// to workgroups before packing to intrinsic.
286+
if (i == mDims.size() - 1)
287+
workgroupTileSizes[mDim] *= schedule->mSize;
284288
subgroupTileSizes[mDim] = schedule->mTileSizes[i];
285289
}
286290
for (auto [i, nDim] : llvm::enumerate(nDims)) {
287291
workgroupTileSizes[nDim] =
288292
schedule->nSubgroupCounts[i] * schedule->nTileSizes[i];
293+
// Multiply by the intrinsic shape for the inner most dim as we distribute
294+
// to workgroups before packing to intrinsic.
295+
if (i == nDims.size() - 1)
296+
workgroupTileSizes[nDim] *= schedule->nSize;
289297
subgroupTileSizes[nDim] = schedule->nTileSizes[i];
290298
}
291299

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
341341
funcPassManager.addPass(createConvolutionToIGEMMPass());
342342
}
343343

344+
tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
345+
/*convertToDpsOptions=*/std::nullopt);
346+
344347
// Step 1. Promote matmul operands and pack to intrinsic shapes.
345348
funcPassManager.addPass(createGPUPromoteMatmulOperandsPass());
346349
funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass());
@@ -357,9 +360,6 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
357360
}
358361
funcPassManager.addPass(createPropagateReshapesByExpansionPass());
359362

360-
tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
361-
/*convertToDpsOptions=*/std::nullopt);
362-
363363
// Step 2. Tile and fuse tileable ops to reduction loops.
364364
{
365365
GPUApplyTilingLevelPassOptions options;

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ func.func @nhwc_conv_mfma() {
2626
// CHECK-SAME: promote_operands = [0, 1]
2727
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
2828
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
29-
// CHECK-SAME: workgroup = [1, 2, 2, 4, 0]
29+
// CHECK-SAME: workgroup = [1, 2, 32, 64, 0]
3030

3131
// -----
3232

@@ -55,7 +55,7 @@ func.func @nchw_conv_mfma() {
5555
// CHECK-SAME: promote_operands = [0, 1]
5656
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
5757
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
58-
// CHECK-SAME: workgroup = [1, 4, 2, 2, 0]
58+
// CHECK-SAME: workgroup = [1, 64, 2, 32, 0]
5959

6060
// -----
6161

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
3939
// CHECK-SAME: promote_operands = [0, 1]
4040
// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
4141
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
42-
// CHECK-SAME: workgroup = [1, 1, 4, 4, 0]
42+
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
4343

4444
// -----
4545

@@ -72,7 +72,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
7272
// CHECK-SAME: promote_operands = [0, 1]
7373
// CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
7474
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
75-
// CHECK-SAME: workgroup = [2, 2, 2, 2, 0, 0]
75+
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]
7676

7777
// -----
7878

@@ -107,7 +107,7 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
107107
// CHECK-SAME: promote_operands = [0, 1]
108108
// CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1]
109109
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
110-
// CHECK-SAME: workgroup = [1, 2, 1, 1, 2, 0, 0]
110+
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]
111111

112112
// -----
113113

@@ -132,7 +132,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
132132
// CHECK-SAME: promote_operands = [0, 1]
133133
// CHECK-SAME: reduction = [0, 0, 2]
134134
// CHECK-SAME: subgroup = [4, 4, 0]
135-
// CHECK-SAME: workgroup = [8, 8, 0]
135+
// CHECK-SAME: workgroup = [128, 128, 0]
136136

137137
// -----
138138

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
use_igemm_convolution = true>
1818
}>
1919
#config = #iree_gpu.lowering_config<{
20-
workgroup = [1, 4, 1, 16, 0],
20+
workgroup = [1, 4, 16, 256, 0],
2121
reduction = [0, 0, 0, 0, 2],
2222
subgroup = [1, 4, 1, 4, 0],
2323
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ hal.executable public @main {
7373
#hal.pipeline.binding<storage_buffer>
7474
]>
7575
#config = #iree_gpu.lowering_config<{
76-
workgroup = [4, 4, 0],
76+
workgroup = [64, 64, 0],
7777
reduction = [0, 0, 2],
7878
subgroup = [2, 2],
7979
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
@@ -140,7 +140,7 @@ hal.executable public @main {
140140
#hal.pipeline.binding<storage_buffer>
141141
]>
142142
#config = #iree_gpu.lowering_config<{
143-
workgroup = [4, 4, 0],
143+
workgroup = [64, 64, 0],
144144
reduction = [0, 0, 2],
145145
subgroup = [2, 2],
146146
mma_kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
@@ -207,7 +207,7 @@ hal.executable public @main {
207207
#hal.pipeline.binding<storage_buffer>
208208
]>
209209
#config = #iree_gpu.lowering_config<{
210-
workgroup = [4, 4, 0],
210+
workgroup = [64, 64, 0],
211211
reduction = [0, 0, 2],
212212
subgroup = [2, 2],
213213
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
@@ -263,7 +263,7 @@ hal.executable public @main {
263263
#hal.pipeline.binding<storage_buffer>
264264
]>
265265
#config = #iree_gpu.lowering_config<{
266-
workgroup = [4, 4, 0],
266+
workgroup = [64, 64, 0],
267267
reduction = [0, 0, 2],
268268
subgroup = [2, 2],
269269
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
@@ -319,7 +319,7 @@ hal.executable public @main {
319319
#hal.pipeline.binding<storage_buffer>
320320
]>
321321
#config = #iree_gpu.lowering_config<{
322-
workgroup = [2, 2, 0],
322+
workgroup = [64, 64, 0],
323323
reduction = [0, 0, 2],
324324
subgroup = [1, 1],
325325
mma_kind = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>,
@@ -375,7 +375,7 @@ hal.executable public @main {
375375
#hal.pipeline.binding<storage_buffer>
376376
]>
377377
#config = #iree_gpu.lowering_config<{
378-
workgroup = [4, 4, 0],
378+
workgroup = [64, 64, 0],
379379
reduction = [0, 0, 2],
380380
subgroup = [2, 2],
381381
mma_kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>,
@@ -578,7 +578,7 @@ hal.executable public @main {
578578
mma_kind = #iree_gpu.mma_layout<WMMA_I32_16x16x16_I8>,
579579
reduction = [0, 0, 4],
580580
subgroup = [2, 4, 0],
581-
workgroup = [4, 8, 0],
581+
workgroup = [64, 128, 0],
582582
promote_operands = [0, 1]
583583
}>
584584

0 commit comments

Comments
 (0)