Commit e43555e
[CodeGen] Fix gather fusion on vector distribute path (iree-org#21117)
Don't attach a lowering config to the gather operation; let it fuse with its consumer operation instead. This eliminates the creation of a temporary buffer. The change also restricts element-wise fusion during softmax decomposition when the producer is a gather-like operation. Fixes iree-org#21107.

1 parent: a073601

7 files changed: +116 −42 lines

compiler/src/iree/compiler/Codegen/Common/DecomposeSoftmax.cpp

Lines changed: 7 additions & 1 deletion
@@ -45,7 +45,13 @@ struct FuseElementWiseGenericOps : public OpRewritePattern<linalg::GenericOp> {
     for (OpOperand &opOperand : genericOp->getOpOperands()) {
       if (!linalg::areElementwiseOpsFusable(&opOperand))
         continue;
-
+      // Don't fuse if the producer has an external capture. For example, a
+      // gather-like payload operation such as 'tensor.extract' would be
+      // cloned into every consumer op, which is not what we want.
+      auto producer = opOperand.get().getDefiningOp<linalg::GenericOp>();
+      if (producer && hasExternalCapture(producer)) {
+        continue;
+      }
       FailureOr<linalg::ElementwiseOpFusionResult> fusionResult =
           linalg::fuseElementwiseOps(rewriter, &opOperand);
       if (succeeded(fusionResult)) {
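For context, a rewrite pattern like `FuseElementWiseGenericOps` is typically applied through MLIR's greedy pattern driver. A minimal sketch of that plumbing, assuming a standalone driver function (`runFusion` and its signature are illustrative, not part of this commit):

```cpp
// Illustrative driver (assumed, not from this commit): register the fusion
// pattern and apply it greedily until no more fusions fire.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

static void runFusion(mlir::func::FuncOp funcOp) {
  mlir::MLIRContext *ctx = funcOp.getContext();
  mlir::RewritePatternSet patterns(ctx);
  patterns.add<FuseElementWiseGenericOps>(ctx);
  // Named applyPatternsAndFoldGreedily in older MLIR revisions.
  if (mlir::failed(mlir::applyPatternsGreedily(funcOp, std::move(patterns)))) {
    // Did not converge; a pass would normally signalPassFailure() here.
  }
}
```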

compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp

Lines changed: 0 additions & 41 deletions
@@ -25,47 +25,6 @@ static bool isScalarOrTensorOfSizeOne(Type t) {
   return t.isIntOrIndexOrFloat();
 }
 
-/// This function checks whether the `genericOp` has any external captures,
-/// i.e., whether it uses any values that are defined outside of its body.
-/// %10 = linalg.generic {indexing_maps = [#map, #map],
-///                       iterator_types = ["parallel", "parallel"]}
-///     ins(%5 : tensor<4096x64xi64>) outs(%9 : tensor<4096x64xf16>) {
-///   ^bb0(%in: i64, %out: f16):
-///     %14 = linalg.index 0 : index
-///     %15 = arith.index_cast %in : i64 to index
-///     %extracted = tensor.extract %4[%14, %15] : tensor<4096x64xf16>
-///     linalg.yield %extracted : f16
-/// } -> tensor<4096x64xf16>
-/// Here %4 is an external capture used via tensor.extract inside
-/// linalg.generic hence the above `genericOp` has an external capture.
-static bool hasExternalCapture(linalg::GenericOp genericOp) {
-  Block &body = genericOp.getRegion().front();
-  for (Operation &op : body.getOperations()) {
-    for (Value operand : op.getOperands()) {
-      if (auto bArg = dyn_cast<BlockArgument>(operand)) {
-        // Check whether the operand lies in the same block.
-        if (bArg.getOwner() == &body) {
-          continue;
-        }
-        return true;
-      }
-      Operation *defOp = operand.getDefiningOp();
-      // Scalar constant is allowed.
-      if (defOp && defOp->hasTrait<mlir::OpTrait::ConstantLike>()) {
-        Type type = operand.getType();
-        if (type.isIntOrFloat() || type.isIndex()) {
-          continue;
-        }
-      }
-      // If the defining op is not inside the block, it's an external value.
-      if (!defOp || defOp->getBlock() != &body) {
-        return true;
-      }
-    }
-  }
-  return false; // All operands are locally defined or block arguments.
-}
-
 /// Rematerialize all parallel elementwise operations into its users within a
 /// `flow.dispatch.region`.
 struct RematerializeParallelOpsPattern

compiler/src/iree/compiler/Codegen/Common/test/decompose_softmax.mlir

Lines changed: 21 additions & 0 deletions
@@ -82,3 +82,24 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
 // CHECK-NO-FUSE: } -> tensor<2x16x32xf32>
 // CHECK-NO-FUSE: return %[[D7]] : tensor<2x16x32xf32>
 // CHECK-NO-FUSE: }
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+func.func @do_not_fuse_gather(%arg0: tensor<4096x64xi64>, %arg1: tensor<4096x64xf32>) -> tensor<4096x64xf32> {
+  %empty = tensor.empty() : tensor<4096x64xf32>
+  %0 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4096x64xi64>) outs(%empty : tensor<4096x64xf32>) {
+  ^bb0(%in: i64, %out: f32):
+    %3 = linalg.index 0 : index
+    %4 = arith.index_cast %in : i64 to index
+    %extracted = tensor.extract %arg1[%3, %4] : tensor<4096x64xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<4096x64xf32>
+  %s_empty = tensor.empty() : tensor<4096x64xf32>
+  %1 = linalg.softmax dimension(1) ins(%0 : tensor<4096x64xf32>) outs(%s_empty : tensor<4096x64xf32>) -> tensor<4096x64xf32>
+  return %1 : tensor<4096x64xf32>
+}
+// CHECK-LABEL: func @do_not_fuse_gather(
+// CHECK:       linalg.generic {{.*}}
+// CHECK:       tensor.extract {{.*}} : tensor<4096x64xf32>
+// CHECK-COUNT-3: linalg.generic

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

Lines changed: 5 additions & 0 deletions
@@ -544,6 +544,11 @@ populateConfigInfo(const llvm::SetVector<linalg::LinalgOp> &computeOps,
   // LinalgOp with only parallel dims. This is needed if the op cannot be fused
   // with a reduction or introduces new loop dimensions.
   auto shouldAttachLoweringConfig = [&](linalg::LinalgOp linalgOp) -> bool {
+    // If the operation has a gather, we want to fuse it with the
+    // reduction.
+    if (hasExternalCapture(cast<linalg::GenericOp>(linalgOp))) {
+      return false;
+    }
     // If some of the users are in computeOps and some are outside of
     // computeOps; attach lowering config, since the op can't be fused.
     if (llvm::any_of(linalgOp->getUsers(),
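One caveat worth noting: `hasExternalCapture` takes a `linalg::GenericOp`, while the lambda receives a `linalg::LinalgOp`, so the `cast` above asserts if a non-generic LinalgOp ever reaches this path. A hedged sketch of a defensive variant (`isGatherLike` is a hypothetical helper, not in this commit):

```cpp
// Hypothetical helper (not part of the commit): only linalg.generic ops can
// capture values in their body region, so any other LinalgOp reports false
// instead of tripping the cast assertion.
static bool isGatherLike(mlir::linalg::LinalgOp linalgOp) {
  auto genericOp =
      llvm::dyn_cast<mlir::linalg::GenericOp>(linalgOp.getOperation());
  return genericOp && hasExternalCapture(genericOp);
}
```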

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir

Lines changed: 40 additions & 0 deletions
@@ -251,3 +251,43 @@ func.func @test_multiple_stores(%arg0: !iree_tensor_ext.dispatch.tensor<readonly
 // CHECK-SAME:   subgroup_basis = {{\[}}[1, 16], [0, 1]],
 // CHECK-SAME:   thread = [0, 4], thread_basis = {{\[}}[1, 64], [0, 1]],
 // CHECK-SAME:   workgroup = [1, 0]
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+// Test that no lowering config is attached to the gather-like operation.
+func.func @test_gather_config(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xi64>>, %arg1: !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096x64xf32>>, %arg2: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4096xf32>>) {
+  %c2_i64 = arith.constant 2 : i64
+  %c0 = arith.constant 0 : index
+  %load1 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [4096], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xi64>> -> tensor<4096xi64>
+  %load2 = iree_tensor_ext.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [4096, 64], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096x64xf32>> -> tensor<4096x64xf32>
+  %0 = tensor.empty() : tensor<4096x64xf32>
+  %1 = linalg.generic {indexing_maps = [#map1, #map], iterator_types = ["parallel", "parallel"]} ins(%load1 : tensor<4096xi64>) outs(%0 : tensor<4096x64xf32>) {
+  ^bb0(%in: i64, %out: f32):
+    %4 = linalg.index 0 : index
+    %5 = linalg.index 1 : index
+    %extracted = tensor.extract %load2[%4, %5] : tensor<4096x64xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<4096x64xf32>
+  %2 = tensor.empty() : tensor<4096xf32>
+  %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%1 : tensor<4096x64xf32>) outs(%2 : tensor<4096xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %4 = arith.addf %in, %out : f32
+    linalg.yield %4 : f32
+  } -> tensor<4096xf32>
+  iree_tensor_ext.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4096xf32>>
+  return
+}
+// CHECK:      #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK:      func.func @test_gather_config
+// CHECK-SAME:   translation_info = #[[$TRANSLATION]]
+// CHECK:      linalg.generic
+// CHECK-NOT:    attrs = {lowering_config = #iree_gpu.lowering_config<{
+// CHECK:      linalg.yield
+// CHECK:      linalg.generic
+// CHECK-SAME:   attrs = {lowering_config = #iree_gpu.lowering_config<{
+// CHECK-SAME:     partial_reduction = [0, 64],
+// CHECK-SAME:     subgroup_basis = {{\[}}[1, 1], [0, 1]],
+// CHECK-SAME:     thread = [0, 1], thread_basis = {{\[}}[1, 64], [0, 1]],
+// CHECK-SAME:     workgroup = [1, 0]

compiler/src/iree/compiler/Codegen/Utils/Utils.cpp

Lines changed: 28 additions & 0 deletions
@@ -1941,4 +1941,32 @@ bool neverRunsSecondIteration(scf::ForOp op) {
   return isUbUnderStep.value_or(false) && isLbNonNegative.value_or(false);
 }
 
+bool hasExternalCapture(linalg::GenericOp genericOp) {
+  Block &body = genericOp.getRegion().front();
+  for (Operation &op : body.getOperations()) {
+    for (Value operand : op.getOperands()) {
+      if (auto bArg = dyn_cast<BlockArgument>(operand)) {
+        // Check whether the operand lies in the same block.
+        if (bArg.getOwner() == &body) {
+          continue;
+        }
+        return true;
+      }
+      Operation *defOp = operand.getDefiningOp();
+      // A scalar constant is allowed.
+      if (defOp && defOp->hasTrait<mlir::OpTrait::ConstantLike>()) {
+        Type type = operand.getType();
+        if (type.isIntOrFloat() || type.isIndex()) {
+          continue;
+        }
+      }
+      // If the defining op is not inside the block, it's an external value.
+      if (!defOp || defOp->getBlock() != &body) {
+        return true;
+      }
+    }
+  }
+  return false; // All operands are locally defined or block arguments.
+}
+
 } // namespace mlir::iree_compiler
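With the helper now in shared utils, a caller can flag gather-like generics with a plain walk. A hedged usage sketch (`collectGatherLikeOps` is an assumed name for illustration, not part of this commit):

```cpp
// Assumed usage sketch (not from the commit): collect the linalg.generic ops
// in a function whose bodies capture external values, e.g. to skip them when
// attaching lowering configs.
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Interfaces/FunctionInterfaces.h"

static llvm::SmallVector<mlir::linalg::GenericOp>
collectGatherLikeOps(mlir::FunctionOpInterface funcOp) {
  llvm::SmallVector<mlir::linalg::GenericOp> gatherLike;
  funcOp.walk([&](mlir::linalg::GenericOp genericOp) {
    if (mlir::iree_compiler::hasExternalCapture(genericOp))
      gatherLike.push_back(genericOp);
  });
  return gatherLike;
}
```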

compiler/src/iree/compiler/Codegen/Utils/Utils.h

Lines changed: 15 additions & 0 deletions
@@ -331,6 +331,21 @@ bool alwaysRunsFirstIteration(scf::ForOp op);
 /// the ForOp.
 bool neverRunsSecondIteration(scf::ForOp op);
 
+/// This function checks whether the `genericOp` has any external captures,
+/// i.e., whether it uses any values that are defined outside of its body.
+/// %10 = linalg.generic {indexing_maps = [#map, #map],
+///                       iterator_types = ["parallel", "parallel"]}
+///     ins(%5 : tensor<4096x64xi64>) outs(%9 : tensor<4096x64xf16>) {
+///   ^bb0(%in: i64, %out: f16):
+///     %14 = linalg.index 0 : index
+///     %15 = arith.index_cast %in : i64 to index
+///     %extracted = tensor.extract %4[%14, %15] : tensor<4096x64xf16>
+///     linalg.yield %extracted : f16
+/// } -> tensor<4096x64xf16>
+/// Here %4 is an external capture used via tensor.extract inside the
+/// linalg.generic, hence the above `genericOp` has an external capture.
+bool hasExternalCapture(linalg::GenericOp genericOp);
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_CODEGEN_UTILS_UTILS_H_
