Skip to content

Commit 1afba19

Browse files
committed
[VPlan] Try to narrow wide and replicating recipes to uniform recipes.
Use the existing VPlan-based analysis to identify recipes that only have their first lane demanded and transform them to uniform replicate recipes. This simplifies the generated code in some places and prepares for fixing #122496.
1 parent 16aa400 commit 1afba19

File tree

4 files changed

+35
-18
lines changed

4 files changed

+35
-18
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -596,11 +596,36 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
596596
if (!PhiR)
597597
continue;
598598

599+
// Try to narrow wide and replicating recipes to uniform recipes, based on
600+
// VPlan analysis.
601+
// TODO: Apply to all recipes in the future, to replace legacy uniformity
602+
// analysis.
603+
auto Users = collectUsersRecursively(PhiR);
604+
for (VPUser *U : reverse(Users)) {
605+
auto *Def = dyn_cast<VPSingleDefRecipe>(U);
606+
auto *RepR = dyn_cast<VPReplicateRecipe>(U);
607+
// Skip recipes that shouldn't be narrowed. Def may be null, so test it first.
608+
if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
609+
Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
610+
(RepR && (RepR->isUniform() || RepR->isPredicated())))
611+
continue;
612+
613+
// Skip recipes that may have other lanes than their first used.
614+
if (!vputils::isUniformAfterVectorization(Def) &&
615+
!vputils::onlyFirstLaneUsed(Def))
616+
continue;
617+
618+
auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
619+
Def->operands(), /*IsUniform*/ true);
620+
Clone->insertAfter(Def);
621+
Def->replaceAllUsesWith(Clone);
622+
}
623+
599624
// Check if any uniform VPReplicateRecipes using the phi recipe are used by
600625
// ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
601626
// ensure the final value is available.
602627
// TODO: Remove once uniformity analysis is done on VPlan.
603-
for (VPUser *U : collectUsersRecursively(PhiR)) {
628+
for (VPUser *U : Users) {
604629
auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
605630
VPValue *Op;
606631
if (!ExitIRI || !match(ExitIRI->getOperand(0),

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,6 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
132132
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
133133
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
134134
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
135-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[CONV6]], i64 0
136-
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
137135
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
138136
; CHECK: [[VECTOR_BODY]]:
139137
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -142,9 +140,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
142140
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
143141
; CHECK-NEXT: [[TMP22:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
144142
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i1> zeroinitializer
145-
; CHECK-NEXT: [[TMP24:%.*]] = select <vscale x 2 x i1> [[TMP23]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> splat (i64 1)
146-
; CHECK-NEXT: [[TMP25:%.*]] = sdiv <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP24]]
147-
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <vscale x 2 x i64> [[TMP25]], i32 0
143+
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i1> [[TMP23]], i32 0
144+
; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1
145+
; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]]
148146
; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
149147
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]]
150148
; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]]

llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,12 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) {
1212
; CHECK: [[VECTOR_BODY]]:
1313
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1414
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
15-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
16-
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
17-
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
1815
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
1916
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
2017
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
2118
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10)
2219
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
2320
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]]
24-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]]
25-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]]
26-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP3]]
2721
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
2822
; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]])
2923
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4

llvm/test/Transforms/LoopVectorize/scalable-assume.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
44
; CHECK-LABEL: @test1(
55
; CHECK: vector.body:
6-
; CHECK: [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
7-
; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
8-
; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
9-
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]])
10-
; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
11-
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]])
6+
; CHECK: [[E1:%.*]] = extractelement <vscale x 2 x float> {{.+}}, i32 0
7+
; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[E1]]
8+
; CHECK-NEXT: [[E2:%.*]] = extractelement <vscale x 2 x float> {{.+}}, i32 0
9+
; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[E2]]
10+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]])
11+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]])
1212
entry:
1313
br label %for.body
1414

0 commit comments

Comments
 (0)