
Commit 9b6e6c8

Author: Xu, Xiaohui1 (committed)
fix reduce loop indice
1 parent 4c901c4 commit 9b6e6c8

File tree

5 files changed, +85 -27 lines changed


include/gc/Analysis/VectorBasedFusionAnalysis.h

Lines changed: 5 additions & 1 deletion

@@ -48,7 +48,11 @@ class TypeHelper {
   int getDataTypeValidSteps(VectorType type);
   /// get vector \param type an even for loop step
   int generateValidSteps(int steps, VectorType type);
-  /// get vector \param type max simd length according to hardware information
+  /// get vector \param type an even for loop step when shape dimension is
+  /// shapeDim
+  int generateValidSteps(int steps, VectorType type, int shapeDim);
+  /// get vector \param type max simd length according to hardware
+  /// information
   int getDataTypeMAXSIMDLength(VectorType type);
   /// get operation's vector type
   VectorType getVectorzedType(Operation *op, uint32_t loopStep = 0);

lib/gc/Analysis/VectorBasedFusionAnalysis.cpp

Lines changed: 12 additions & 2 deletions

@@ -374,14 +374,24 @@ VectorType TypeHelper::getVectorzedType(Operation *op, uint32_t loopStep) {
   return VectorType::get({loopStep}, vectorizedType.getElementType());
 }
 
+int TypeHelper::generateValidSteps(int steps, VectorType type, int shapeDim) {
+  if (shapeDim & 1)
+    return 1;
+  auto typebits = type.getElementTypeBitWidth();
+  if (shapeDim >= steps)
+    return steps * typebits >= 128 ? steps : 1;
+  int evenStep = getNearestVectorStep(shapeDim);
+  return evenStep * typebits >= 128 ? evenStep : 1;
+}
+
 int TypeHelper::generateValidSteps(int steps, VectorType type) {
   // TODO: support odd shape using mask load store
   if (type.getShape().back() & 1)
     return 1;
+  auto typebits = type.getElementTypeBitWidth();
   if (type.getShape().back() >= steps)
-    return steps;
+    return steps * typebits >= 128 ? steps : 1;
   int evenStep = getNearestVectorStep(type.getShape().back());
-  auto typebits = type.getElementTypeBitWidth();
   return evenStep * typebits >= 128 ? evenStep : 1;
 }
 
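The new three-argument overload only keeps a vector step when that step fills at least a 128-bit register for the given element width; otherwise it falls back to a scalar step of 1, and odd dimensions always fall back to 1. As a reading aid, here is a minimal standalone C++ sketch of that selection logic, with the element bit width passed as a plain integer and nearestVectorStep as an assumed stand-in for the project's getNearestVectorStep helper, not the actual implementation:

#include <cstdio>

// Assumed stand-in for getNearestVectorStep: round an even dimension down to
// a power-of-two step. The real helper lives in the analysis sources.
static int nearestVectorStep(int dim) {
  int step = 1;
  while (step * 2 <= dim)
    step *= 2;
  return step;
}

// Mirrors the logic of the new generateValidSteps(steps, type, shapeDim)
// overload, with the element bit width passed directly instead of a
// VectorType.
static int generateValidSteps(int steps, int elemBits, int shapeDim) {
  if (shapeDim & 1)
    return 1;                                   // odd dims: scalar loop
  if (shapeDim >= steps)
    return steps * elemBits >= 128 ? steps : 1; // keep step only if >= 128 bits
  int evenStep = nearestVectorStep(shapeDim);
  return evenStep * elemBits >= 128 ? evenStep : 1;
}

int main() {
  // A 64-wide f32 dimension with a requested step of 16 keeps the full step
  // (16 * 32 = 512 bits), matching the step-16 inner loop in the updated test.
  printf("%d\n", generateValidSteps(16, 32, 64)); // 16
  // A small even dimension of 8 keeps step 8 (8 * 32 = 256 bits).
  printf("%d\n", generateValidSteps(16, 32, 8)); // 8
  // An odd dimension falls back to a scalar loop.
  printf("%d\n", generateValidSteps(16, 32, 63)); // 1
  return 0;
}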

lib/gc/Transforms/CPUPhysicalRegisterPass.cpp

Lines changed: 61 additions & 22 deletions

@@ -545,12 +545,6 @@ void updateCurrentArgsStatus(ValueRange loopState, const size_t loopStateIdx,
                              DenseMap<Value, Value> &nextOriginalOperandMap,
                              DenseMap<Value, Value> &nextOperandOriginalMap) {
   Value currentArgs = loopState[loopStateIdx];
-  if (currentArgs.getType() != originalValue.getType()) {
-    llvm::outs() << loopStateIdx << ","
-                 << "\n";
-    currentArgs.dump();
-    llvm::llvm_unreachable_internal("Type not equal.");
-  }
   nextAnchorArgs.emplace_back(currentArgs);
   nextAnchorArgsIdxMap[currentArgs] = nextAnchorArgs.size() - 1;
   nextOriginalOperandMap[originalValue] = currentArgs;
@@ -740,6 +734,36 @@ void updateLoopArgsData(Value val, Value originalVal,
   originalOperandLoopArgsMap[originalVal] = val;
 }
 
+void LoopGeneratorImpl::rectifyParallelIndice(
+    GenerateLoopHelper &loopHelperParam, OpBuilder &b, Location loc) {
+  MultiReductionCanonicalizer rdCanonicalizer =
+      getMultiRdCanonicalizers()[loopHelperParam.groupIdx];
+  auto &multireductionOp = rdCanonicalizer.getCandidateOps()[0];
+  SmallVector<int64_t, 4> &reductionAxis = rdCanonicalizer.getReductionAxis();
+
+  // rectify indice of read from source operand
+  auto sourceReadOp =
+      multireductionOp.getSource().getDefiningOp<vector::TransferReadOp>();
+  if (!sourceReadOp)
+    return;
+
+  AffineExpr outterParallel, innerParallel;
+  bindDims(multireductionOp->getContext(), outterParallel, innerParallel);
+
+  Value op =
+      loopHelperParam.inductionVars[loopHelperParam.inductionVars.size() -
+                                    reductionAxis.size() - 2];
+  Value ip =
+      loopHelperParam.inductionVars[loopHelperParam.inductionVars.size() -
+                                    reductionAxis.size() - 1];
+  Value newIndice = b.createOrFold<affine::AffineApplyOp>(
+      loc, (outterParallel + innerParallel), ValueRange{op, ip});
+  int parallelSize = rdCanonicalizer.getParallelAxis().size();
+  int readIndiceOffset =
+      1 + rdCanonicalizer.getParallelAxis()[parallelSize - 1];
+  sourceReadOp->setOperand(readIndiceOffset, newIndice);
+}
+
 scf::ForOp LoopGeneratorImpl::reductionAxisGenerateForLoop(
     OpBuilder &opBuilder, const size_t reductionIdx,
     GenerateLoopHelper &loopHelperParam) {
@@ -755,18 +779,22 @@ scf::ForOp LoopGeneratorImpl::reductionAxisGenerateForLoop(
 
   const auto loc = multireductionOp->getLoc();
   SmallVector<int64_t, 4> &reductionAxis = rdCanonicalizer.getReductionAxis();
-  bool lastDimReduction = rdCanonicalizer.hasLastDimReduction();
   VectorType vectorType = rdCanonicalizer.getSourceType();
-  const int loopStep =
-      getVectorBasedFusion().getGroupMaxSteps()[loopHelperParam.groupIdx];
+  auto tpHelper = fusionStrategy.getTypeHelper();
+
+  int loopStep = tpHelper.generateValidSteps(
+      fusionStrategy.getTypeHelper().getDataTypeMAXSIMDLength(vectorType),
+      vectorType, vectorType.getShape()[reductionAxis[reductionIdx]]);
+  bool isLastDimReduction = rdCanonicalizer.getHasLastDimReduction();
+  loopStep = (reductionIdx == reductionAxis.size() - 1 && isLastDimReduction)
+                 ? loopStep
+                 : 1;
+
   func::FuncOp func = fusionStrategy.getFunction();
   IRRewriter rewriterOfFunc(func);
 
   Value zero = makeIndexArithConstantOp(opBuilder, loc, 0);
-  Value forSteps = makeIndexArithConstantOp(
-      opBuilder, loc,
-      (reductionIdx == reductionAxis.size() - 1 && lastDimReduction) ? loopStep
-                                                                     : 1);
+  Value forSteps = makeIndexArithConstantOp(opBuilder, loc, loopStep);
   Value numIter = makeIndexArithConstantOp(
       opBuilder, loc, vectorType.getShape()[reductionAxis[reductionIdx]]);
   scf::ForOp forOp = opBuilder.create<scf::ForOp>(
@@ -868,9 +896,12 @@ scf::ForOp LoopGeneratorImpl::reductionAxisGenerateForLoop(
         }
 
         rewriteOperationAsVectorize(b, loopHelperParam.groupIdx,
-                                    &movingOperation);
+                                    &movingOperation,
+                                    isLastDimReduction ? loopStep : 0);
         loopHelperParam.loopIterArgs = loopState;
         moveOperationsToCurrentForBody(b, movingOperation, loopHelperParam);
+        if (isLastDimReduction)
+          rectifyParallelIndice(loopHelperParam, b, loc);
         loopHelperParam.movedOps = &movingOperation;
         loopHelperParam.candidateOps = &opQueue;
 
@@ -1058,11 +1089,16 @@ scf::ForOp LoopGeneratorImpl::parallelAxisGenerateForLoop(
           // get accumualte value
           Attribute initValueAttr;
           getReductionInitAttr(multiReductionOp, initValueAttr);
-
+          SmallVector<int64_t, 4> &reductionAxis =
+              rdCanonicalizer.getReductionAxis();
+          TypeHelper tpHelper = fusionStrategy.getTypeHelper();
+          int loopStep = tpHelper.generateValidSteps(
+              tpHelper.getDataTypeMAXSIMDLength(vectorType), vectorType,
+              vectorType.getShape()[reductionAxis[reductionAxis.size() - 1]]);
           auto accVal = b.create<arith::ConstantOp>(
               loc, DenseElementsAttr::get(
                        fusionStrategy.getTypeHelper().getVectorzedType(
-                           multiReductionOp, dimSize),
+                           multiReductionOp, loopStep),
                        {initValueAttr}));
 
           // put accumulte val at first for loop args
@@ -1247,14 +1283,14 @@ void LoopGeneratorImpl::rearrageMultiReductionIR(
   DenseMap<size_t, size_t> varLoopIdxMap;
   VectorType groupVector =
       getVectorBasedFusion().getGroupBiggestRankVectorType()[grpIdx];
-  for (size_t i = 0; i < parallelAxis.size(); i++) {
+  for (size_t i = 0; i < parallelAxis.size(); i++)
     varLoopIdxMap[parallelAxis[i]] = i;
-  }
+
   size_t offset = rdCanonicalizer.hasLastDimReduction() ? 1 : 0;
   for (size_t i = parallelAxis.size() + offset;
-       i < groupVector.getRank() + offset; i++) {
+       i < groupVector.getRank() + offset; i++)
     varLoopIdxMap[reductionAxis[i - parallelAxis.size() - offset]] = i;
-  }
+
   while (!tmpSourceQ.empty()) {
     auto *curOp = tmpSourceQ.front();
     tmpSourceQ.pop();
@@ -2313,7 +2349,8 @@ void ForLoopGenerator::createNewConstantOp(
 
 /// Rewrite the operations in the group to vectorized form.
 void ForLoopGenerator::rewriteOperationAsVectorize(
-    OpBuilder &rewriter, size_t groupId, const std::queue<Operation *> *queue) {
+    OpBuilder &rewriter, size_t groupId, const std::queue<Operation *> *queue,
+    const size_t vectorizeStep) {
   const std::queue<Operation *> groupOps =
       !queue ? getVectorBasedFusion().getOpGroups()[groupId] : *queue;
 
@@ -2322,7 +2359,9 @@ void ForLoopGenerator::rewriteOperationAsVectorize(
   DenseMap<Operation *, AffineMap> &opPermuationMap =
       getVectorBasedFusion().getOpPermuationMap();
   std::queue<Operation *> transformQueue(groupOps);
-  size_t groupSteps = getVectorBasedFusion().getGroupMaxSteps()[groupId];
+  size_t groupSteps = vectorizeStep == 0
+                          ? getVectorBasedFusion().getGroupMaxSteps()[groupId]
+                          : vectorizeStep;
 
   while (!transformQueue.empty()) {
     Operation *op = transformQueue.front();
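The new rectifyParallelIndice hook runs after the moved operations land in the innermost reduction body: it rewrites the multi-reduction's source vector.transfer_read so that its leading parallel index becomes an affine.apply of (d0, d1) -> (d0 + d1) over the two parallel induction variables that sit just before the reduction induction variables (the #map9 check added to the test below). Here is a small standalone C++ sketch of only that index bookkeeping, with strings standing in for the mlir::Value induction variables; the variable names and axis counts are illustrative, taken from the reduce_fuse_test12 shape rather than from the pass itself:

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Illustrative loop nest for the 16x32x64 reduce-fuse test: two parallel
  // induction variables (outer tile iv, inner iv) followed by two reduction
  // induction variables. Names are placeholders, not the pass's SSA values.
  std::vector<std::string> inductionVars = {"arg2", "arg4", "arg6", "arg8"};
  const std::size_t numReductionAxes = 2; // reduction over the last two dims

  // Same arithmetic as rectifyParallelIndice: the two parallel ivs sit
  // immediately before the reduction ivs in inductionVars.
  const std::size_t n = inductionVars.size();
  const std::string &outerParallel = inductionVars[n - numReductionAxes - 2];
  const std::string &innerParallel = inductionVars[n - numReductionAxes - 1];

  // Their sum replaces the transfer_read index at offset
  // 1 + <last parallel axis>, expressed as affine.apply (d0 + d1).
  printf("affine.apply (d0 + d1)(%s, %s)\n", outerParallel.c_str(),
         innerParallel.c_str());
  return 0;
}

For this loop nest the sketch selects arg2 and arg4, the same pair the updated FileCheck lines feed to affine.apply #map9.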

lib/gc/Transforms/TilingVector.hpp

Lines changed: 4 additions & 1 deletion

@@ -354,7 +354,8 @@ class ForLoopGenerator {
   /// rewrite operation as vectorize IR in current operation group
   void
   rewriteOperationAsVectorize(OpBuilder &rewriter, size_t groupId,
-                              const std::queue<Operation *> *queue = nullptr);
+                              const std::queue<Operation *> *queue = nullptr,
+                              const size_t vectorizeStep = 0);
   /// Reimplementation of writing a tensor from a constant of denseElementattr.
   void createNewConstantOp(Operation *srcOp,
                            vector::TransferWriteOp *transferWriteOp,
@@ -489,6 +490,8 @@ class LoopGeneratorImpl : public ForLoopGenerator {
   scf::ForOp reductionAxisGenerateForLoop(OpBuilder &opBuilder,
                                           const size_t reductionIdx,
                                           GenerateLoopHelper &loopHelperParam);
+  void rectifyParallelIndice(GenerateLoopHelper &loopHelperParam, OpBuilder &b,
+                             Location loc);
   /// reduction operation parallel axis for loop
   scf::ForOp parallelAxisGenerateForLoop(OpBuilder &opBuilder,
                                          GenerateLoopHelper &loopHelperParam);

test/mlir/test/gc/Transforms/cpu-phyaical-register.mlir

Lines changed: 3 additions & 1 deletion

@@ -10,6 +10,7 @@
 // CHECK-DAG: #[[map6:.*]] = affine_map<(d0, d1) -> (d0 floordiv 16 + d1 floordiv 16)>
 // CHECK-DAG: #[[map7:.*]] = affine_map<()[s0, s1] -> (s0 * 32 + s1)>
 // CHECK-DAG: #[[map8:.*]] = affine_map<()[s0, s1] -> (s0 * 16 + s1)>
+// CHECK-DAG: #[[map9:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
 
 
 
@@ -619,7 +620,8 @@ func.func @reduce_fuse_test12(%input: tensor<16x32x64xf32>,
 // CHECK: scf.for %[[arg4:.*]] = %[[C0]] to %[[C16]] step %[[C1]] iter_args(%[[arg5:.*]] = %[[READ1]]) -> (vector<16xf32>)
 // CHECK: scf.for %[[arg6:.*]] = %[[C0]] to %[[C32]] step %[[C1]] iter_args(%[[arg7:.*]] = %[[CST]]) -> (vector<16xf32>)
 // CHECK: scf.for %[[arg8:.*]] = %[[C0]] to %[[C64]] step %[[C16]] iter_args(%[[arg9:.*]] = %[[arg7]]) -> (vector<16xf32>)
-// CHECK: %[[READ2:.*]] = vector.transfer_read {{.*}}[%[[arg2]], %[[arg6]], %[[arg8]]], {{.*}} {in_bounds = [true]} : tensor<16x32x64xf32>, vector<16xf32>
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[map9]](%[[arg2]], %[[arg4]])
+// CHECK: %[[READ2:.*]] = vector.transfer_read {{.*}}[%[[APPLY0]], %[[arg6]], %[[arg8]]], {{.*}} {in_bounds = [true]} : tensor<16x32x64xf32>, vector<16xf32>
 // CHECK: %[[ADD0:.*]] = arith.addf %[[READ2]], %[[arg9]] : vector<16xf32>
 // CHECK: %[[REDUCTION:.*]] = vector.reduction <add>, {{.*}} : vector<16xf32> into f32
 // CHECK: %[[INSERT:.*]] = vector.insert %[[REDUCTION]], %[[arg5]] [%[[arg4]]] : f32 into vector<16xf32>
