
Commit f59480f

[GPU] Add rematerialize parallel ops in the vector distribute pipeline (#21073)
This enables elementwise op fusion, and some cases might benefit from it. Fixes issue #20875.
1 parent ca25934 commit f59480f
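
For context, the rematerialization this commit hooks into the vector distribute pipeline fuses a parallel elementwise producer into its consumer by recomputing the producer's body. A minimal sketch of the kind of IR it targets (hypothetical example, not taken from the commit):

#map = affine_map<(d0) -> (d0)>
func.func @remat_example(%a: tensor<8xf32>, %b: tensor<8xf32>) -> tensor<8xf32> {
  %empty = tensor.empty() : tensor<8xf32>
  // Parallel elementwise producer.
  %0 = linalg.generic {indexing_maps = [#map, #map],
                       iterator_types = ["parallel"]}
      ins(%a : tensor<8xf32>) outs(%empty : tensor<8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %e = math.exp %in : f32
    linalg.yield %e : f32
  } -> tensor<8xf32>
  // Parallel elementwise consumer. Rematerialization recomputes math.exp
  // inside this body, collapsing the two generics into one.
  %1 = linalg.generic {indexing_maps = [#map, #map, #map],
                       iterator_types = ["parallel"]}
      ins(%0, %b : tensor<8xf32>, tensor<8xf32>) outs(%empty : tensor<8xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %m = arith.mulf %in, %in_0 : f32
    linalg.yield %m : f32
  } -> tensor<8xf32>
  return %1 : tensor<8xf32>
}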

File tree

3 files changed: +81 -2 lines changed

compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp

Lines changed: 47 additions & 2 deletions
@@ -25,6 +25,47 @@ static bool isScalarOrTensorOfSizeOne(Type t) {
   return t.isIntOrIndexOrFloat();
 }
 
+/// This function checks whether the `genericOp` has any external captures,
+/// i.e., whether it uses any values that are defined outside of its body.
+/// %10 = linalg.generic {indexing_maps = [#map, #map],
+///                       iterator_types = ["parallel", "parallel"]}
+///     ins(%5 : tensor<4096x64xi64>) outs(%9 : tensor<4096x64xf16>) {
+///   ^bb0(%in: i64, %out: f16):
+///     %14 = linalg.index 0 : index
+///     %15 = arith.index_cast %in : i64 to index
+///     %extracted = tensor.extract %4[%14, %15] : tensor<4096x64xf16>
+///     linalg.yield %extracted : f16
+///   } -> tensor<4096x64xf16>
+/// Here %4 is an external capture used via tensor.extract inside
+/// linalg.generic, hence the above `genericOp` has an external capture.
+static bool hasExternalCapture(linalg::GenericOp genericOp) {
+  Block &body = genericOp.getRegion().front();
+  for (Operation &op : body.getOperations()) {
+    for (Value operand : op.getOperands()) {
+      if (auto bArg = dyn_cast<BlockArgument>(operand)) {
+        // Check whether the operand lies in the same block.
+        if (bArg.getOwner() == &body) {
+          continue;
+        }
+        return true;
+      }
+      Operation *defOp = operand.getDefiningOp();
+      // A scalar constant is allowed.
+      if (defOp && defOp->hasTrait<mlir::OpTrait::ConstantLike>()) {
+        Type type = operand.getType();
+        if (type.isIntOrFloat() || type.isIndex()) {
+          continue;
+        }
+      }
+      // If the defining op is not inside the block, it's an external value.
+      if (!defOp || defOp->getBlock() != &body) {
+        return true;
+      }
+    }
+  }
+  return false; // All operands are locally defined or block arguments.
+}
+
 /// Rematerialize all parallel elementwise operations into its users within a
 /// `flow.dispatch.region`.
 struct RematerializeParallelOpsPattern
@@ -44,9 +85,13 @@ struct RematerializeParallelOpsPattern
 
     // Find the first operand that is defined by another generic op on tensors.
     for (OpOperand &opOperand : genericOp->getOpOperands()) {
-      if (!linalg::areElementwiseOpsFusable(&opOperand))
+      if (!linalg::areElementwiseOpsFusable(&opOperand)) {
         continue;
-
+      }
+      auto producer = opOperand.get().getDefiningOp<linalg::GenericOp>();
+      if (producer && hasExternalCapture(producer)) {
+        continue;
+      }
       FailureOr<linalg::ElementwiseOpFusionResult> fusionResult =
           linalg::fuseElementwiseOps(rewriter, &opOperand);
       if (succeeded(fusionResult)) {
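
Note the ConstantLike carve-out above: a scalar constant captured from outside the body does not count as an external capture, so only genuine external values (such as the tensor.extract source in the doc comment) block fusion. A hypothetical producer that remains fusable under this check (sketch, assuming the helper's logic as shown):

#map = affine_map<(d0) -> (d0)>
func.func @scalar_capture_ok(%a: tensor<8xf32>) -> tensor<8xf32> {
  // %cst is defined outside the region, but it is ConstantLike and scalar,
  // so hasExternalCapture() returns false and this generic stays fusable.
  %cst = arith.constant 2.0 : f32
  %empty = tensor.empty() : tensor<8xf32>
  %0 = linalg.generic {indexing_maps = [#map, #map],
                       iterator_types = ["parallel"]}
      ins(%a : tensor<8xf32>) outs(%empty : tensor<8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %m = arith.mulf %in, %cst : f32
    linalg.yield %m : f32
  } -> tensor<8xf32>
  return %0 : tensor<8xf32>
}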

compiler/src/iree/compiler/Codegen/Common/test/rematerialize_parallel_ops.mlir

Lines changed: 31 additions & 0 deletions
@@ -138,3 +138,34 @@ func.func @no_rematerialize_scalar_ops(%arg0 : tensor<f32>) -> tensor<f32> {
 // CHECK: linalg.generic
 // CHECK: linalg.generic
 // CHECK: linalg.generic
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+// Do not fuse a generic that has an external capture.
+func.func @no_external_capture_fusion(%arg0: tensor<4096x64xi64>, %arg1: tensor<4096x64xf16>, %arg2: tensor<4096x64xf16>, %arg3: f32, %arg4: tensor<4096x4096xf32>) -> tensor<4096x4096xf32> {
+  %empty = tensor.empty() : tensor<4096x64xf16>
+  %0 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4096x64xi64>) outs(%arg1 : tensor<4096x64xf16>) {
+  ^bb0(%in: i64, %out: f16):
+    %3 = linalg.index 0 : index
+    %4 = arith.index_cast %in : i64 to index
+    %extracted = tensor.extract %empty[%3, %4] : tensor<4096x64xf16>
+    linalg.yield %extracted : f16
+  } -> tensor<4096x64xf16>
+  %1 = linalg.fill ins(%arg3 : f32) outs(%arg4 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg2, %0 : tensor<4096x64xf16>, tensor<4096x64xf16>) outs(%1 : tensor<4096x4096xf32>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f32):
+    %3 = arith.extf %in : f16 to f32
+    %4 = arith.extf %in_0 : f16 to f32
+    %5 = arith.mulf %3, %4 : f32
+    %6 = arith.addf %out, %5 : f32
+    linalg.yield %6 : f32
+  } -> tensor<4096x4096xf32>
+  return %2 : tensor<4096x4096xf32>
+}
+// CHECK-LABEL: func @no_external_capture_fusion(
+// CHECK: linalg.generic
+// CHECK: linalg.generic
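
The diff omits the test file's lit header; tests for this pass are typically driven by a RUN line of roughly the following shape. The exact pass flag is an assumption here, the authoritative one is in the actual test file:

// RUN: iree-opt --split-input-file --iree-codegen-rematerialize-parallel-ops %s | FileCheck %s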

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 3 additions & 0 deletions
@@ -865,6 +865,9 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
       /*convertToDpsOptions=*/std::nullopt,
       /*reorderStrategy=*/reorderStrategy);
 
+  // Some of the elementwise fusion can benefit from this pass.
+  funcPassManager.addPass(createRematerializeParallelOpsPass());
+
   if (usePadToModelSharedMemcpy) {
     funcPassManager.addPass(createLLVMGPUPromoteMatmulToFitMMAPass());
   }
