Move SCEV expansion so it's done after we know if we're tail-folding or not

SamTebbs33 · SamTebbs33 · commit 2a351ed7ee51 · 2024-08-06T17:25:18.000+01:00
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -363,7 +363,7 @@ class LoopVectorizationPlanner {
   /// loop iteration.
   std::optional<VectorizationFactor>
   plan(ElementCount UserVF, unsigned UserIC,
-       SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
+       std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, std::function<Value*(const SCEV *)> Expander, bool &HasAliasMask);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6906,8 +6906,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 std::optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
-                               SmallVector<PointerDiffInfoValues> RTChecks,
-                               bool &HasAliasMask) {
+                               std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
+                               std::function<Value*(const SCEV*)> Expander, bool &HasAliasMask) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
@@ -6916,6 +6916,18 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return std::nullopt;
 
+  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
+  // here and put them into a list.
+  SmallVector<PointerDiffInfoValues> DiffChecksValues;
+  if (RTChecks.has_value()
+      && useActiveLaneMask(CM.getTailFoldingStyle(true))) {
+    for (auto Check : *RTChecks) {
+      Value *Sink = Expander(Check.SinkStart);
+      Value *Src = Expander(Check.SrcStart);
+      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
+    }
+  }
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
       !useMaskedInterleavedAccesses(TTI)) {
@@ -6944,7 +6956,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
     CM.collectInLoopReductions();
     if (CM.selectUserVectorizationFactor(UserVF)) {
       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-      buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
+      buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
       if (!hasPlanWithVF(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
                           << ".\n");
@@ -6979,9 +6991,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   }
 
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
-                           RTChecks, HasAliasMask);
+                           DiffChecksValues, HasAliasMask);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
-                           RTChecks, HasAliasMask);
+                           DiffChecksValues, HasAliasMask);
 
   LLVM_DEBUG(printPlans(dbgs()));
   if (VPlans.empty())
@@ -9907,24 +9919,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
                            AddBranchWeights);
 
-  // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
-  // here and put them into a list.
-  std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
-      LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
-  SmallVector<PointerDiffInfoValues> DiffChecksValues;
-  if (DiffChecks.has_value() &&
-      useActiveLaneMask(CM.getTailFoldingStyle(true))) {
-    Instruction *Loc = L->getLoopPreheader()->getTerminator();
-    for (auto Check : *DiffChecks) {
-      Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
-      Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
-      DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
-    }
-  }
-
   // Plan how to best vectorize, return the best VF and its cost.
+  auto Expand = [&Checks, &L](const SCEV *S) {
+    return Checks.expandCodeForMemCheck(S, L->getLoopPreheader()->getTerminator());
+  };
   std::optional<VectorizationFactor> MaybeVF =
-      LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
+      LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
 
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -153,7 +153,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
 ; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
-; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -163,6 +163,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
+; PRED-NEXT:    [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
+; PRED-NEXT:    [[NEG_COMPARE:%.*]] = icmp slt i64 [[DIFF]], 0
+; PRED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
+; PRED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -177,9 +184,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
 ; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
 ; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
 ; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
 ; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -188,16 +196,19 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
+; PRED-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
+; PRED-NEXT:    [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
 ; PRED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
 ; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add = add nuw nsw i32 %i.09, 1
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i32, ptr %arrayidx, align 4
-; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %0, 5
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i32, ptr %arrayidx, align 4
+; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %add1 = add nsw i32 %1, 5
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add1, ptr %arrayidx2, align 4
 ; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %add, %n