Skip to content

Commit f828914

Browse files
[GPU] Move tile-and-distribute pass before packing to intrinsic for the TileAndFuse pipeline (#19053)
We want to distribute to workgroups first so that we can promote operands to handle unaligned-to-intrinsic cases before we concretize the MMA shapes.

Signed-off-by: Nirvedh Meshram <[email protected]>
1 parent 8391943 commit f828914

File tree

6 files changed

+25
-17
lines changed

6 files changed

+25
-17
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,19 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
281281
for (auto [i, mDim] : llvm::enumerate(mDims)) {
282282
workgroupTileSizes[mDim] =
283283
schedule->mSubgroupCounts[i] * schedule->mTileSizes[i];
284+
// Multiply by the intrinsic shape for the inner most dim as we distribute
285+
// to workgroups before packing to intrinsic.
286+
if (i == mDims.size() - 1)
287+
workgroupTileSizes[mDim] *= schedule->mSize;
284288
subgroupTileSizes[mDim] = schedule->mTileSizes[i];
285289
}
286290
for (auto [i, nDim] : llvm::enumerate(nDims)) {
287291
workgroupTileSizes[nDim] =
288292
schedule->nSubgroupCounts[i] * schedule->nTileSizes[i];
293+
// Multiply by the intrinsic shape for the inner most dim as we distribute
294+
// to workgroups before packing to intrinsic.
295+
if (i == nDims.size() - 1)
296+
workgroupTileSizes[nDim] *= schedule->nSize;
289297
subgroupTileSizes[nDim] = schedule->nTileSizes[i];
290298
}
291299

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
341341
funcPassManager.addPass(createConvolutionToIGEMMPass());
342342
}
343343

344+
tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
345+
/*convertToDpsOptions=*/std::nullopt);
346+
344347
// Step 1. Promote matmul operands and pack to intrinsic shapes.
345348
funcPassManager.addPass(createGPUPromoteMatmulOperandsPass());
346349
funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass());
@@ -357,9 +360,6 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
357360
}
358361
funcPassManager.addPass(createPropagateReshapesByExpansionPass());
359362

360-
tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
361-
/*convertToDpsOptions=*/std::nullopt);
362-
363363
// Step 2. Tile and fuse tileable ops to reduction loops.
364364
{
365365
GPUApplyTilingLevelPassOptions options;

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ func.func @nhwc_conv_mfma() {
2626
// CHECK-SAME: promote_operands = [0, 1]
2727
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
2828
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
29-
// CHECK-SAME: workgroup = [1, 2, 2, 4, 0]
29+
// CHECK-SAME: workgroup = [1, 2, 32, 64, 0]
3030

3131
// -----
3232

@@ -55,7 +55,7 @@ func.func @nchw_conv_mfma() {
5555
// CHECK-SAME: promote_operands = [0, 1]
5656
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
5757
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
58-
// CHECK-SAME: workgroup = [1, 4, 2, 2, 0]
58+
// CHECK-SAME: workgroup = [1, 64, 2, 32, 0]
5959

6060
// -----
6161

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
3939
// CHECK-SAME: promote_operands = [0, 1]
4040
// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
4141
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
42-
// CHECK-SAME: workgroup = [1, 1, 4, 4, 0]
42+
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
4343

4444
// -----
4545

@@ -72,7 +72,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
7272
// CHECK-SAME: promote_operands = [0, 1]
7373
// CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
7474
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
75-
// CHECK-SAME: workgroup = [2, 2, 2, 2, 0, 0]
75+
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]
7676

7777
// -----
7878

@@ -107,7 +107,7 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
107107
// CHECK-SAME: promote_operands = [0, 1]
108108
// CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1]
109109
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
110-
// CHECK-SAME: workgroup = [1, 2, 1, 1, 2, 0, 0]
110+
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]
111111

112112
// -----
113113

@@ -132,7 +132,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
132132
// CHECK-SAME: promote_operands = [0, 1]
133133
// CHECK-SAME: reduction = [0, 0, 2]
134134
// CHECK-SAME: subgroup = [4, 4, 0]
135-
// CHECK-SAME: workgroup = [8, 8, 0]
135+
// CHECK-SAME: workgroup = [128, 128, 0]
136136

137137
// -----
138138

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
use_igemm_convolution = true>
1818
}>
1919
#config = #iree_gpu.lowering_config<{
20-
workgroup = [1, 4, 1, 16, 0],
20+
workgroup = [1, 4, 16, 256, 0],
2121
reduction = [0, 0, 0, 0, 2],
2222
subgroup = [1, 4, 1, 4, 0],
2323
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ hal.executable public @main {
7373
#hal.pipeline.binding<storage_buffer>
7474
]>
7575
#config = #iree_gpu.lowering_config<{
76-
workgroup = [4, 4, 0],
76+
workgroup = [64, 64, 0],
7777
reduction = [0, 0, 2],
7878
subgroup = [2, 2],
7979
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
@@ -140,7 +140,7 @@ hal.executable public @main {
140140
#hal.pipeline.binding<storage_buffer>
141141
]>
142142
#config = #iree_gpu.lowering_config<{
143-
workgroup = [4, 4, 0],
143+
workgroup = [64, 64, 0],
144144
reduction = [0, 0, 2],
145145
subgroup = [2, 2],
146146
mma_kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
@@ -207,7 +207,7 @@ hal.executable public @main {
207207
#hal.pipeline.binding<storage_buffer>
208208
]>
209209
#config = #iree_gpu.lowering_config<{
210-
workgroup = [4, 4, 0],
210+
workgroup = [64, 64, 0],
211211
reduction = [0, 0, 2],
212212
subgroup = [2, 2],
213213
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
@@ -263,7 +263,7 @@ hal.executable public @main {
263263
#hal.pipeline.binding<storage_buffer>
264264
]>
265265
#config = #iree_gpu.lowering_config<{
266-
workgroup = [4, 4, 0],
266+
workgroup = [64, 64, 0],
267267
reduction = [0, 0, 2],
268268
subgroup = [2, 2],
269269
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
@@ -319,7 +319,7 @@ hal.executable public @main {
319319
#hal.pipeline.binding<storage_buffer>
320320
]>
321321
#config = #iree_gpu.lowering_config<{
322-
workgroup = [2, 2, 0],
322+
workgroup = [64, 64, 0],
323323
reduction = [0, 0, 2],
324324
subgroup = [1, 1],
325325
mma_kind = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>,
@@ -375,7 +375,7 @@ hal.executable public @main {
375375
#hal.pipeline.binding<storage_buffer>
376376
]>
377377
#config = #iree_gpu.lowering_config<{
378-
workgroup = [4, 4, 0],
378+
workgroup = [64, 64, 0],
379379
reduction = [0, 0, 2],
380380
subgroup = [2, 2],
381381
mma_kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>,
@@ -578,7 +578,7 @@ hal.executable public @main {
578578
mma_kind = #iree_gpu.mma_layout<WMMA_I32_16x16x16_I8>,
579579
reduction = [0, 0, 4],
580580
subgroup = [2, 4, 0],
581-
workgroup = [4, 8, 0],
581+
workgroup = [64, 128, 0],
582582
promote_operands = [0, 1]
583583
}>
584584

0 commit comments

Comments
 (0)