Commit 06eaead
[Codegen] Run LoopCoalescingPass at the end of warp reduce (#19950)
I observed that this improved some of the punet dispatches that use warp reduction from 80µs to 60µs ([example](https://gist.github.com/IanWood1/5b0e4fcb4e90a02525b94ea4347145f5)). The gain is hard to separate from CI benchmark noise, but locally I measured a ~0.7ms end-to-end improvement on punet.

Signed-off-by: Ian Wood <[email protected]>

2 files changed: +82 -0

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 2 additions & 0 deletions
@@ -943,6 +943,8 @@ void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
       /*expandSubgroupReduction=*/true));
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
+  funcPassManager.addPass(affine::createLoopCoalescingPass());
+  funcPassManager.addPass(createCanonicalizerPass());
 }
 
 void addGPUPackUnPackPasses(OpPassManager &funcPassManager) {
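
For context, affine::createLoopCoalescingPass creates upstream MLIR's affine-loop-coalescing pass, which also collapses perfectly nested scf.for loops with static, unit-step bounds into a single loop, recovering the original induction variables with affine.delinearize_index; the trailing canonicalizer then cleans up the rewritten index arithmetic. A minimal standalone sketch of the transformation (the function below is illustrative, not part of this commit):

    // Before: a perfect 15x15 scf.for nest, the shape the new test expects
    // to be coalesced (compare the CHECK lines below: 225 = 15 * 15).
    func.func @nested(%buf: memref<15x15xf32>) {
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c15 = arith.constant 15 : index
      %cst = arith.constant 1.000000e+00 : f32
      scf.for %i = %c0 to %c15 step %c1 {
        scf.for %j = %c0 to %c15 step %c1 {
          memref.store %cst, %buf[%i, %j] : memref<15x15xf32>
        }
      }
      return
    }

    // After coalescing, the nest becomes a single loop, roughly:
    //   %c225 = arith.constant 225 : index
    //   scf.for %iv = %c0 to %c225 step %c1 {
    //     %ij:2 = affine.delinearize_index %iv into (15, 15) : index, index
    //     memref.store %cst, %buf[%ij#0, %ij#1] : memref<15x15xf32>
    //   }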

compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir

Lines changed: 80 additions & 0 deletions
@@ -333,3 +333,83 @@ hal.executable private @matvec_fp16 {
 // CDNA3-COUNT-24: gpu.shuffle xor
 // CDNA3: scf.if
 // CDNA3: vector.transfer_write {{.+}} : vector<8xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable public @multi_reduction {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @multi_reduction ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multi_reduction() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %cst_0 = arith.constant 2.304000e+05 : f32
+        %cst_1 = arith.constant 9.99999974E-6 : f32
+        %c85483008 = arith.constant 85483008 : index
+        %c165416448 = arith.constant 165416448 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x60x3840xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c165416448) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x60x3840xf32>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 60, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x60x3840xf16>> -> tensor<2x32x60x3840xf16>
+        %3 = tensor.empty() : tensor<2x32x60x3840xf32>
+        %4 = tensor.empty() : tensor<2x32xf32>
+        %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<2x32x60x3840xf16>) outs(%3 : tensor<2x32x60x3840xf32>) {
+        ^bb0(%in: f16, %out: f32):
+          %11 = arith.extf %in : f16 to f32
+          linalg.yield %11 : f32
+        } -> tensor<2x32x60x3840xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32>
+        %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x60x3840xf32>) outs(%6 : tensor<2x32xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = arith.addf %in, %out : f32
+          linalg.yield %11 : f32
+        } -> tensor<2x32xf32>
+        %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = arith.divf %in, %cst_0 : f32
+          linalg.yield %11 : f32
+        } -> tensor<2x32xf32>
+        %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5, %8 : tensor<2x32x60x3840xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) {
+        ^bb0(%in: f32, %in_2: f32, %out: f32):
+          %11 = arith.subf %in, %in_2 : f32
+          %12 = arith.mulf %11, %11 : f32
+          %13 = arith.addf %12, %out : f32
+          linalg.yield %13 : f32
+        } -> tensor<2x32xf32>
+        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2, %8, %9 : tensor<2x32x60x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%3 : tensor<2x32x60x3840xf32>) {
+        ^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
+          %11 = arith.divf %in_3, %cst_0 : f32
+          %12 = arith.addf %11, %cst_1 : f32
+          %13 = math.rsqrt %12 : f32
+          %14 = arith.extf %in : f16 to f32
+          %15 = arith.subf %14, %in_2 : f32
+          %16 = arith.mulf %15, %13 : f32
+          linalg.yield %16 : f32
+        } -> tensor<2x32x60x3840xf32>
+        flow.dispatch.tensor.store %10, %1, offsets = [0, 0, 0, 0], sizes = [2, 32, 60, 3840], strides = [1, 1, 1, 1] : tensor<2x32x60x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x60x3840xf32>>
+        return
+      }
+    }
+  }
+}
+
+// Check that all loops are singly nested.
+//
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
+// CHECK: func.func @multi_reduction()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C225:.+]] = arith.constant 225 : index
+// CHECK: %[[RES0:.+]] = scf.for %[[ARG0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C225]] step %[[C1]]
+// CHECK-NEXT: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (15, 15) : index, index
+// CHECK: %[[RES1:.+]] = scf.for %[[ARG0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C225]] step %[[C1]]
+// CHECK-NEXT: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (15, 15) : index, index
+// CHECK: scf.for %[[ARG0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C225]] step %[[C1]]
+// CHECK-NEXT: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (15, 15) : index, index
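
A note on the expectations above: each coalesced scf.for runs from 0 to 225 with step 1, and 225 = 15 × 15, so the affine.delinearize_index into (15, 15) recovers the induction variables of what were previously two nested 15-iteration loops. Assuming an upstream MLIR build and an input file (here called nested.mlir) containing a perfect nest like the sketch earlier, the standalone rewrite can be reproduced with:

    mlir-opt --affine-loop-coalescing nested.mlir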
