Update the reduction index-rectification code (rectifyParallelIndice)

Xu, Xiaohui1 · Xu, Xiaohui1 · commit 919dd11cb001 · 2024-09-23T14:29:33.000+08:00
diff --git a/include/gc/Transforms/Utils/VectorUtils.h b/include/gc/Transforms/Utils/VectorUtils.h
@@ -18,6 +18,8 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include <limits>
+#include <queue>
 #include <stdint.h>
+#include <unordered_set>
 #include <variant>
 
@@ -151,6 +153,50 @@ T getInitValForReduce(vector::CombiningKind kind, Type t) {
   return result;
 }
 
+/// Collects into `candidateOps` the `TARGETOP` operations that `op` depends on
+/// within its own block.
+///
+/// If `op` itself is a `TARGETOP` it is the only operation collected. Otherwise
+/// the defining operations of its operands are searched breadth-first: a
+/// `TARGETOP` is collected and not searched through, any other operation has
+/// its own operands searched in turn. Block arguments and values defined in a
+/// different block than their user are skipped.
+///
+/// Every operation is visited at most once, so a producer reached through
+/// several operands (e.g. both operands of `addf %x, %x`) is collected a single
+/// time. A null `op` is accepted and collects nothing.
+template <typename TARGETOP>
+void getSameBlockTargetOp(Operation *op,
+                          std::queue<Operation *> &candidateOps) {
+  if (!op)
+    return;
+  if (isa<TARGETOP>(op)) {
+    candidateOps.push(op);
+    return;
+  }
+
+  std::unordered_set<Operation *> visited{op};
+  std::queue<Operation *> trackOps;
+  trackOps.push(op);
+  while (!trackOps.empty()) {
+    Operation *trackSrcOp = trackOps.front();
+    trackOps.pop();
+    for (Value opd : trackSrcOp->getOperands()) {
+      Operation *defOp = opd.getDefiningOp();
+      // A null defining op means the operand is a block argument.
+      if (!defOp || defOp->getBlock() != trackSrcOp->getBlock())
+        continue;
+      // Already queued or collected through another operand.
+      if (!visited.insert(defOp).second)
+        continue;
+      if (isa<TARGETOP>(defOp))
+        candidateOps.push(defOp);
+      else
+        trackOps.push(defOp);
+    }
+  }
+}
+
 } // namespace gc
 } // namespace mlir
 
diff --git a/lib/gc/Transforms/CPUPhysicalRegisterPass.cpp b/lib/gc/Transforms/CPUPhysicalRegisterPass.cpp
@@ -735,33 +735,54 @@ void updateLoopArgsData(Value val, Value originalVal,
 }
 
+/// Rewrites one index operand of every vector.transfer_read that feeds the
+/// source of this group's multi-reduction op from within the same block.
+///
+/// The new index is an affine.apply that adds two induction variables from
+/// `loopHelperParam.inductionVars`: the two entries located just before the
+/// last `reductionAxis.size()` entries. It is created right before each read,
+/// at location `loc`. Nothing is changed when the reduction source has no
+/// defining operation or when no read is found.
 void LoopGeneratorImpl::rectifyParallelIndice(
-    GenerateLoopHelper &loopHelperParam, OpBuilder &b, Location loc) {
+    GenerateLoopHelper &loopHelperParam, Location loc) {
   MultiReductionCanonicalizer rdCanonicalizer =
       getMultiRdCanonicalizers()[loopHelperParam.groupIdx];
   auto &multireductionOp = rdCanonicalizer.getCandidateOps()[0];
   SmallVector<int64_t, 4> &reductionAxis = rdCanonicalizer.getReductionAxis();
 
   // rectify indice of read from source operand
-  auto sourceReadOp =
-      multireductionOp.getSource().getDefiningOp<vector::TransferReadOp>();
-  if (!sourceReadOp)
-    return;
-
-  AffineExpr outterParallel, innerParallel;
-  bindDims(multireductionOp->getContext(), outterParallel, innerParallel);
-
-  Value op =
-      loopHelperParam.inductionVars[loopHelperParam.inductionVars.size() -
-                                    reductionAxis.size() - 2];
-  Value ip =
-      loopHelperParam.inductionVars[loopHelperParam.inductionVars.size() -
-                                    reductionAxis.size() - 1];
-  Value newIndice = b.createOrFold<affine::AffineApplyOp>(
-      loc, (outterParallel + innerParallel), ValueRange{op, ip});
-  int parallelSize = rdCanonicalizer.getParallelAxis().size();
-  int readIndiceOffset =
-      1 + rdCanonicalizer.getParallelAxis()[parallelSize - 1];
-  sourceReadOp->setOperand(readIndiceOffset, newIndice);
+  // A source without a defining op (a block argument) cannot lead to a read;
+  // bail out instead of handing a null op to the search below.
+  Operation *sourceOp = multireductionOp.getSource().getDefiningOp();
+  if (!sourceOp)
+    return;
+  std::queue<Operation *> candidateOps;
+  getSameBlockTargetOp<vector::TransferReadOp>(sourceOp, candidateOps);
+  if (candidateOps.empty())
+    return;
+
+  // The affine expression, its inputs and the operand position do not depend
+  // on the read being patched, so they are computed once up front.
+  AffineExpr outterParallel, innerParallel;
+  bindDims(multireductionOp->getContext(), outterParallel, innerParallel);
+  const auto ivCount = loopHelperParam.inductionVars.size();
+  Value op = loopHelperParam.inductionVars[ivCount - reductionAxis.size() - 2];
+  Value ip = loopHelperParam.inductionVars[ivCount - reductionAxis.size() - 1];
+  int parallelSize = rdCanonicalizer.getParallelAxis().size();
+  // NOTE(review): the leading 1 presumably skips the read's source operand so
+  // that the offset addresses an index operand — confirm.
+  int readIndiceOffset =
+      1 + rdCanonicalizer.getParallelAxis()[parallelSize - 1];
+
+  while (not candidateOps.empty()) {
+    Operation *sourceReadOp = candidateOps.front();
+    candidateOps.pop();
+    // Build the index immediately before the read that will consume it.
+    IRRewriter rewriter(sourceReadOp);
+    rewriter.setInsertionPoint(sourceReadOp);
+    Value newIndice = rewriter.createOrFold<affine::AffineApplyOp>(
+        loc, (outterParallel + innerParallel), ValueRange{op, ip});
+    sourceReadOp->setOperand(readIndiceOffset, newIndice);
+  }
 }
 
 scf::ForOp LoopGeneratorImpl::reductionAxisGenerateForLoop(
@@ -901,7 +905,7 @@ scf::ForOp LoopGeneratorImpl::reductionAxisGenerateForLoop(
           loopHelperParam.loopIterArgs = loopState;
           moveOperationsToCurrentForBody(b, movingOperation, loopHelperParam);
           if (isLastDimReduction)
-            rectifyParallelIndice(loopHelperParam, b, loc);
+            rectifyParallelIndice(loopHelperParam, loc);
           loopHelperParam.movedOps = &movingOperation;
           loopHelperParam.candidateOps = &opQueue;
 
@@ -2768,7 +2772,7 @@ void GroupOperationFusionImpl::broadcastFromElements(Operation *op,
           DenseElementsAttr::get(dataType, constantOp.getValue()),
           newOperandType);
       if (failed(res))
-        llvm::llvm_unreachable_internal("Wrong to create constant op.");
+        llvm_unreachable("Wrong to create constant op.");
       removeOpInCurrentGroups(grpIdx, op, res.value().getDefiningOp());
 
     } else {
diff --git a/lib/gc/Transforms/TilingVector.hpp b/lib/gc/Transforms/TilingVector.hpp
@@ -490,8 +490,7 @@ class LoopGeneratorImpl : public ForLoopGenerator {
   scf::ForOp reductionAxisGenerateForLoop(OpBuilder &opBuilder,
                                           const size_t reductionIdx,
                                           GenerateLoopHelper &loopHelperParam);
-  void rectifyParallelIndice(GenerateLoopHelper &loopHelperParam, OpBuilder &b,
-                             Location loc);
+  void rectifyParallelIndice(GenerateLoopHelper &loopHelperParam, Location loc);
   /// reduction operation parallel axis for loop
   scf::ForOp parallelAxisGenerateForLoop(OpBuilder &opBuilder,
                                          GenerateLoopHelper &loopHelperParam);
diff --git a/test/mlir/test/gc/Transforms/cpu-phyaical-register.mlir b/test/mlir/test/gc/Transforms/cpu-phyaical-register.mlir
@@ -577,7 +577,8 @@ func.func @reduce_fusePostOp_test11(%input: tensor<16x32x64xf32>,
 // CHECK: %[[READ0:.*]] = vector.transfer_read %[[arg5]][%[[arg2]], %[[arg4]]], %[[CST_0]] {in_bounds = [true]} : tensor<16x32xf32>, vector<16xf32> 
 // CHECK: scf.for %[[arg6:.*]] = %[[C0]] to %[[C16]] step %[[C1]] iter_args(%[[arg7:.*]] = %[[READ0]]) -> (vector<16xf32>)
 // CHECK: scf.for %[[arg8:.*]] = %[[C0]] to %[[C64]] step %[[C16]] iter_args(%[[arg9:.*]] = %[[CST]]) -> (vector<16xf32>)
-// CHECK: %[[READ1:.*]] = vector.transfer_read %arg0[%[[arg2]], %[[arg4]], %[[arg8]]], {{.*}} {in_bounds = [true]} : tensor<16x32x64xf32>, vector<16xf32>
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[map9]](%[[arg4]], %[[arg6]])
+// CHECK: %[[READ1:.*]] = vector.transfer_read %arg0[%[[arg2]], %[[APPLY0]], %[[arg8]]], {{.*}} {in_bounds = [true]} : tensor<16x32x64xf32>, vector<16xf32>
 // CHECK: %[[ADD0:.*]] = arith.addf %[[READ1]], %[[READ1]] : vector<16xf32>
 // CHECK: %[[ADD1:.*]] = arith.addf %[[ADD0]], %[[arg9]] : vector<16xf32>
 // CHECK: %[[REDUCTION:.*]] = vector.reduction <add>, {{.*}} : vector<16xf32> into f32