Skip to content

Commit ef52b83

Browse files
committed
!fixup address comments, thanks!
1 parent 49dc5d5 commit ef52b83

File tree

5 files changed

+98
-78
lines changed

5 files changed

+98
-78
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 63 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -781,78 +781,83 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
781781
return false;
782782

783783
// Collect recipes that need hoisting.
784-
SmallVector<VPRecipeBase *> WorkList;
784+
SmallVector<VPRecipeBase *> HoistCandidates;
785785
SmallPtrSet<VPRecipeBase *, 8> Seen;
786-
VPBasicBlock *HoistBlock = FOR->getParent();
787-
auto HoistPoint = HoistBlock->getFirstNonPhi();
788-
auto TryToPushHoistCandidate = [&](VPRecipeBase *HoistCandidate) {
789-
// If we reach FOR, it means the original Previous depends on some other
790-
// recurrence that in turn depends on FOR. If that is the case, we would
791-
// also need to hoist recipes involving the other FOR, which may break
792-
// dependencies.
793-
if (HoistCandidate == FOR)
794-
return false;
795-
796-
// Hoist candidate outside any region, no need to hoist.
797-
if (!HoistCandidate->getParent()->getParent())
798-
return true;
799-
800-
// Hoist candidate is a header phi or already visited, no need to hoist.
801-
if (isa<VPHeaderPHIRecipe>(HoistCandidate) ||
802-
!Seen.insert(HoistCandidate).second)
803-
return true;
786+
VPRecipeBase *HoistPoint = nullptr;
787+
// Find the closest hoist point by looking at all users of FOR and selecting
788+
// the recipe dominating all other users.
789+
for (VPUser *U : FOR->users()) {
790+
auto *R = dyn_cast<VPRecipeBase>(U);
791+
if (!R)
792+
continue;
793+
if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
794+
HoistPoint = R;
795+
}
804796

805-
// If we reached a recipe that dominates all users of FOR, we don't need to
797+
auto NeedsHoisting = [HoistPoint, &VPDT,
798+
&Seen](VPValue *HoistCandidateV) -> VPRecipeBase * {
799+
VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
800+
if (!HoistCandidate)
801+
return nullptr;
802+
assert((!HoistCandidate->getParent()->getParent() ||
803+
HoistCandidate->getParent()->getParent() ==
804+
HoistCandidate->getParent()->getEnclosingLoopRegion()) &&
805+
"CFG in VPlan should still be flattened, without replicate regions");
806+
// Hoist candidate has already beend visited, no need to hoist.
807+
if (!Seen.insert(HoistCandidate).second)
808+
return nullptr;
809+
810+
// Candidate is outside loop region or a header phi, dominates FOR users w/o
811+
// hoisting.
812+
if (!HoistCandidate->getParent()->getEnclosingLoopRegion() ||
813+
isa<VPHeaderPHIRecipe>(HoistCandidate))
814+
return nullptr;
815+
816+
// If we reached a recipe that dominates HoistPoint, we don't need to
806817
// hoist the recipe.
807-
if (all_of(FOR->users(), [&VPDT, HoistCandidate](VPUser *U) {
808-
return VPDT.properlyDominates(HoistCandidate, cast<VPRecipeBase>(U));
809-
})) {
810-
if (VPDT.properlyDominates(&*HoistPoint, HoistCandidate)) {
811-
// This HoistCandidate domiantes all users of FOR and is closer to them
812-
// than the previous HoistPoint.
813-
HoistPoint = std::next(HoistCandidate->getIterator());
814-
HoistBlock = HoistCandidate->getParent();
815-
}
816-
return true;
817-
}
818-
819-
// Don't move candiates with sideeffects, as we do not yet analyze recipes
820-
// between candidate and hoist destination yet.
821-
if (HoistCandidate->mayHaveSideEffects())
822-
return false;
823-
824-
WorkList.push_back(HoistCandidate);
825-
return true;
818+
if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
819+
return nullptr;
820+
return HoistCandidate;
821+
};
822+
auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
823+
// Avoid hoisting candidates with side-effects, as we do not yet analyze
824+
// associated dependencies.
825+
return !HoistCandidate->mayHaveSideEffects();
826826
};
827827

828828
// Recursively try to hoist Previous and its operands before all users of FOR.
829-
// Update HoistPoint to the closest recipe that dominates all users of FOR.
830-
if (!TryToPushHoistCandidate(Previous))
831-
return false;
829+
if (NeedsHoisting(Previous->getVPSingleValue()))
830+
HoistCandidates.push_back(Previous);
832831

833-
for (unsigned I = 0; I != WorkList.size(); ++I) {
834-
VPRecipeBase *Current = WorkList[I];
832+
for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
833+
VPRecipeBase *Current = HoistCandidates[I];
835834
assert(Current->getNumDefinedValues() == 1 &&
836835
"only recipes with a single defined value expected");
836+
if (!CanHoist(Current))
837+
return false;
837838

838-
for (VPValue *Op : Current->operands())
839-
if (auto *R = Op->getDefiningRecipe())
840-
if (!TryToPushHoistCandidate(R))
841-
return false;
839+
for (VPValue *Op : Current->operands()) {
840+
// If we reach FOR, it means the original Previous depends on some other
841+
// recurrence that in turn depends on FOR. If that is the case, we would
842+
// also need to hoist recipes involving the other FOR, which may break
843+
// dependencies.
844+
if (Op == FOR)
845+
return false;
846+
847+
if (auto *R = NeedsHoisting(Op))
848+
HoistCandidates.push_back(R);
849+
}
842850
}
843851

844852
// Keep recipes to hoist ordered by dominance so earlier instructions are
845853
// processed first.
846-
sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
854+
sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
847855
return VPDT.properlyDominates(A, B);
848856
});
849857

850-
for (VPRecipeBase *HoistCandidate : WorkList) {
851-
if (HoistPoint == HoistCandidate->getIterator()) {
852-
HoistPoint = std::next(HoistCandidate->getIterator());
853-
continue;
854-
}
855-
HoistCandidate->moveBefore(*HoistBlock, HoistPoint);
858+
for (VPRecipeBase *HoistCandidate : HoistCandidates) {
859+
HoistCandidate->moveBefore(*HoistPoint->getParent(),
860+
HoistPoint->getIterator());
856861
}
857862

858863
return true;
@@ -881,9 +886,9 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
881886
Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
882887
}
883888

884-
if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
885-
if (!hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
886-
return false;
889+
if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
890+
!hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
891+
return false;
887892

888893
// Introduce a recipe to combine the incoming and previous values of a
889894
// fixed-order recurrence.

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ struct VPlanTransforms {
3636
GetIntOrFpInductionDescriptor,
3737
ScalarEvolution &SE, const TargetLibraryInfo &TLI);
3838

39-
/// Try to move users of fixed-order recurrences after the recipe defining
40-
/// their previous value, either by sinking them or hoisting the recipe
39+
/// Try to have all users of fixed-order recurrences appear after the recipe
40+
/// defining their previous value, by either sinking or hoisting the recipe
4141
/// defining their previous value (and its operands). Then introduce
4242
/// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
4343
/// recurrence phis and previous values. The current implementation assumes
44-
/// all users can be sunk after the previous value, which is enforced by
45-
/// earlier legality checks.
44+
/// all users can be sunk after the previous value, or the previous value can
45+
/// be hoisted before all users, which is enforced by earlier legality checks.
4646
/// \returns true if all users of fixed-order recurrences could be re-arranged
4747
/// as needed or false if it is not possible. In the latter case, \p Plan is
4848
/// not valid.

llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,18 @@ exit:
280280
}
281281

282282
; Test for https://github.com/llvm/llvm-project/issues/106523.
283+
; %for.2 requires no code motion, as its previous (%or) precedes its (first)
284+
; user (store). Furthermore, its user cannot sink, being a store.
285+
;
286+
; %for.1 requires code motion, as its previous (%trunc) follows its (first)
287+
; user (%or). Sinking %or past %trunc seems possible, as %or has no uses
288+
; (except for feeding %for.2; worth strengthening VPlan's dce?). However, %or
289+
; is both the user of %for.1 and the previous of %for.2, and we refrain from
290+
; sinking instructions that act as previous because they (may) serve points to
291+
; sink after.
292+
293+
; Instead, %for.1 can be reconciled by hoisting its previous above its user
294+
; %or, as this user %trunc depends only on %iv.
283295
define void @for_iv_trunc_optimized(ptr %dst) {
284296
; CHECK-LABEL: @for_iv_trunc_optimized(
285297
; CHECK-NEXT: bb:
@@ -294,8 +306,8 @@ define void @for_iv_trunc_optimized(ptr %dst) {
294306
; CHECK-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
295307
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
296308
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
297-
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
298-
; CHECK-NEXT: [[TMP3]] = or <4 x i32> [[TMP1]], zeroinitializer
309+
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP0]], <i32 3, i32 3, i32 3, i32 3>
310+
; CHECK-NEXT: [[TMP3]] = or <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
299311
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
300312
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
301313
; CHECK-NEXT: store i32 [[TMP6]], ptr [[DST:%.*]], align 4
@@ -316,7 +328,7 @@ define void @for_iv_trunc_optimized(ptr %dst) {
316328
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[ADD:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
317329
; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ [[TRUNC:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
318330
; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ [[OR:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT4]], [[SCALAR_PH]] ]
319-
; CHECK-NEXT: [[OR]] = or i32 [[FOR_1]], 0
331+
; CHECK-NEXT: [[OR]] = or i32 [[FOR_1]], 3
320332
; CHECK-NEXT: [[ADD]] = add i64 [[IV]], 1
321333
; CHECK-NEXT: store i32 [[FOR_2]], ptr [[DST]], align 4
322334
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
@@ -329,10 +341,10 @@ bb:
329341
br label %loop
330342

331343
loop:
332-
%iv = phi i64 [ %add, %loop ], [ 1, %bb ]
333-
%for.1 = phi i32 [ %trunc, %loop ], [ 1, %bb ]
334-
%for.2 = phi i32 [ %or, %loop ], [ 0, %bb ]
335-
%or = or i32 %for.1, 0
344+
%iv = phi i64 [ 1, %bb ], [ %add, %loop ]
345+
%for.1 = phi i32 [ 1, %bb ], [ %trunc, %loop ]
346+
%for.2 = phi i32 [ 0, %bb ], [ %or, %loop ]
347+
%or = or i32 %for.1, 3
336348
%add = add i64 %iv, 1
337349
store i32 %for.2, ptr %dst, align 4
338350
%icmp = icmp ult i64 %iv, 337

llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ exit:
149149
; This test has two FORs (for.x and for.y) where incoming value from the previous
150150
; iteration (for.x.prev) of one FOR (for.y) depends on another FOR (for.x).
151151
; Sinking would require moving a recipe with side effects (store). Instead,
152-
; for.x.prev can be hoisted.
152+
; for.x.next can be hoisted.
153153
define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
154154
; CHECK-LABEL: 'test_chained_first_order_recurrences_4'
155155
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
@@ -167,10 +167,10 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
167167
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next>
168168
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev>
169169
; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
170+
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
170171
; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
171172
; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
172173
; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
173-
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
174174
; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
175175
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
176176
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>

llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,10 @@ exit:
382382
ret void
383383
}
384384

385-
define void @hoist_previous_value_and_operands(ptr %dst, i64 %mask) {
385+
; Similar to the test cases for https://github.com/llvm/llvm-project/issues/106523.
386+
; The previous truncation (%trunc) gets vectorized (rather than folded into an
387+
; IV) and hoisted along with its AND operand above the user 'or'.
388+
define void @hoist_previous_value_and_operand(ptr %dst, i64 %mask) {
386389
; CHECK-LABEL: @hoist_previous_value_and_operands(
387390
; CHECK-NEXT: bb:
388391
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -397,13 +400,13 @@ define void @hoist_previous_value_and_operands(ptr %dst, i64 %mask) {
397400
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
398401
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
399402
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
403+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
404+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
400405
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
401406
; CHECK-NEXT: [[TMP2]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
402407
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
403408
; CHECK-NEXT: [[TMP4]] = or <4 x i32> [[TMP3]], zeroinitializer
404409
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
405-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
406-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
407410
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
408411
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
409412
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
@@ -437,10 +440,10 @@ bb:
437440
br label %loop
438441

439442
loop:
440-
%iv = phi i64 [ %add, %loop ], [ 1, %bb ]
441-
%for.1 = phi i32 [ %trunc, %loop ], [ 1, %bb ]
442-
%for.2 = phi i32 [ %or, %loop ], [ 0, %bb ]
443-
%or = or i32 %for.1, 0
443+
%iv = phi i64 [ 1, %bb ], [ %add, %loop ]
444+
%for.1 = phi i32 [ 1, %bb ], [ %trunc, %loop ]
445+
%for.2 = phi i32 [ 0, %bb ], [ %or, %loop ]
446+
%or = or i32 %for.1, 3
444447
%add = add i64 %iv, 1
445448
%gep = getelementptr inbounds i32, ptr %dst, i64 %iv
446449
store i32 %for.2, ptr %gep, align 4

0 commit comments

Comments
 (0)