
Commit 54cd445

[Codegen][Tuner] Add root_op for matvec and reduction along VectorDistribute pipeline (#22348)
Context: While triaging the BOO tuner, I came across the bug "No root ops found" and submitted this PR to fix it. The `VectorDistribute` pipeline also supports reduction and matvec operations through the `setReductionConfig()` function; this PR ensures that the `root_op` attribute is correctly added along that configuration path. Once the tuner begins supporting matvec and reduction operations, this PR will become directly useful.

Signed-off-by: Bangtian Liu <[email protected]>
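For orientation: judging by the FileCheck patterns in the new test below (e.g. `{lowering_config = #{{.*}}, root_op}`), `root_op` appears to be a plain unit attribute attached to the root operation. A minimal sketch of what a helper like `setRootOpInfo` could look like follows; this is an illustrative assumption (`setRootOpInfoSketch` is a made-up name), not the actual IREE implementation.

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"

// Illustrative only: mark `op` as the root op by attaching a `root_op`
// unit attribute, mirroring the marker the tests below check for.
static void setRootOpInfoSketch(mlir::Operation *op) {
  op->setAttr("root_op", mlir::UnitAttr::get(op->getContext()));
}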
1 parent 59ce62f

2 files changed: +42, -0 lines

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ReductionConfigUtils.cpp

Lines changed: 3 additions & 0 deletions

@@ -714,6 +714,9 @@ LogicalResult setReductionConfig(IREE::GPU::TargetAttr target,
       context, CodeGenPipeline::LLVMGPUVectorDistribute, SymbolRefAttr(),
       {workgroupSize, 1, 1}, subgroupSize, pipelineConfig);
 
+  if (clSetTunerAttr) {
+    setRootOpInfo(op);
+  }
   return setTranslationInfo(entryPoint, translationInfo);
 }
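Once the attribute is in place, tuner-side code can recover the root op by scanning for it. The following is a hedged sketch of such a lookup; `findRootOp` is a hypothetical name for illustration, not an existing IREE API.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Operation.h"

// Hypothetical helper: walk the function and return the op carrying the
// `root_op` unit attribute set by the configuration logic above.
static mlir::Operation *findRootOp(mlir::func::FuncOp func) {
  mlir::Operation *root = nullptr;
  func.walk([&](mlir::Operation *op) {
    if (op->hasAttr("root_op"))
      root = op;
  });
  return root;
}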

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_root_op_attribute.mlir

Lines changed: 39 additions & 0 deletions

@@ -10,3 +10,42 @@ func.func @matmul(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>) -> tensor<4x4xf3
 }
 
 // CHECK: %2 = linalg.matmul {lowering_config = #{{.*}}, root_op} ins(%arg0, %arg1 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32>
+
+// -----
+
+func.func @matvec(%matrix: tensor<32000x4096xf16>, %vector: tensor<4096xf16>, %init: tensor<32000xf16>) {
+  %output = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>]>) binding(0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32000xf16>>
+  %result = linalg.matvec ins(%matrix, %vector : tensor<32000x4096xf16>, tensor<4096xf16>) outs(%init : tensor<32000xf16>) -> tensor<32000xf16>
+  iree_tensor_ext.dispatch.tensor.store %result, %output, offsets = [0], sizes = [32000], strides = [1] : tensor<32000xf16> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32000xf16>>
+  return
+}
+
+// CHECK: #translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-LABEL: func.func @matvec
+// CHECK: linalg.matvec
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: root_op
+
+// -----
+
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+
+func.func @reduction_sum(%input: tensor<2x32x128x4096xf32>, %init: tensor<2x32xf32>) {
+  %output = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>]>) binding(0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32xf32>>
+  %result = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
+      ins(%input : tensor<2x32x128x4096xf32>) outs(%init : tensor<2x32xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %add = arith.addf %in, %out : f32
+    linalg.yield %add : f32
+  } -> tensor<2x32xf32>
+  iree_tensor_ext.dispatch.tensor.store %result, %output, offsets = [0, 0], sizes = [2, 32], strides = [1, 1] : tensor<2x32xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32xf32>>
+  return
+}
+
+// CHECK: #translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-LABEL: func.func @reduction_sum
+// CHECK: %{{.*}} = linalg.generic
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "reduction"]
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: root_op
