
Commit f30fdd9

[LV] Optimize VPWidenIntOrFpInductionRecipe for known TC
Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the narrowest type necessary when the trip count of a loop is known to be constant and the only use of the recipe is the condition used by the vector loop's backedge branch.
1 parent 5862a87 commit f30fdd9

13 files changed (+4515 lines, -154 lines)
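As an illustrative sketch of the kind of loop this targets (assuming tail folding by masking with VF=8 and the constant trip count of 100 from the latch_branch_cost test below; the function and value names here are hypothetical, not taken from the commit):

define void @store_100_bytes(ptr %dst) {
entry:
  br label %loop

loop:
  ; canonical i64 induction with a known constant trip count of 100
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i8, ptr %dst, i64 %iv
  store i8 0, ptr %gep, align 1
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 100
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

Before this change the header mask compared a <8 x i64> widened induction against splat (i64 99); with it, the widened induction and the truncated backedge-taken count become <8 x i8> and i8 99, as the updated PRED check lines in the test diff below show.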

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 0 deletions
@@ -1802,6 +1802,9 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
              VPSlotTracker &SlotTracker) const override;
 #endif
 
+  /// Update the step value of the recipe.
+  void setStepValue(VPValue *V) { setOperand(1, V); }
+
   VPValue *getVFValue() { return getOperand(2); }
   const VPValue *getVFValue() const { return getOperand(2); }

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 90 additions & 9 deletions
@@ -20,6 +20,7 @@
 #include "VPlanPatternMatch.h"
 #include "VPlanUtils.h"
 #include "VPlanVerifier.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
@@ -29,6 +30,8 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/TypeSize.h"
 
 using namespace llvm;
 
@@ -1086,11 +1089,74 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
   }
 }
 
-void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE) {
-  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
-  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
+/// Optimize the width of vector induction variables in \p Plan based on a known
+/// constant Trip Count, \p BestVF and \p BestUF.
+static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
+                                                     ElementCount BestVF,
+                                                     unsigned BestUF) {
+  // Only proceed if we have not completely removed the vector region.
+  if (!Plan.getVectorLoopRegion())
+    return false;
+
+  auto *TC = dyn_cast_if_present<ConstantInt>(
+      Plan.getTripCount()->getUnderlyingValue());
+  if (!TC || !BestVF.isFixed())
+    return false;
+
+  // Calculate the widest type required for known TC, VF and UF.
+  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
+    auto AlignedTC =
+        Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
+                                       APInt::Rounding::UP);
+    auto MaxVal = AlignedTC - 1;
+    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
+  };
+  unsigned NewBitWidth =
+      ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
+
+  LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
+
+  bool MadeChange = false;
+
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+    if (!WideIV || !WideIV->isCanonical() ||
+        WideIV->hasMoreThanOneUniqueUser() ||
+        NewIVTy == WideIV->getScalarType())
+      continue;
+
+    // Currently only handle cases where the single user is a header-mask
+    // comparison with the backedge-taken-count.
+    using namespace VPlanPatternMatch;
+    if (!match(*WideIV->user_begin(),
+               m_Binary<Instruction::ICmp>(
+                   m_Specific(WideIV),
+                   m_Specific(Plan.getOrCreateBackedgeTakenCount()))))
+      continue;
+
+    // Update IV operands and comparison bound to use new narrower type.
+    auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+    WideIV->setStartValue(NewStart);
+    auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+    WideIV->setStepValue(NewStep);
+
+    auto *NewBTC = new VPWidenCastRecipe(
+        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
+    Plan.getVectorPreheader()->appendRecipe(NewBTC);
+    auto *Cmp = dyn_cast<VPInstruction>(*WideIV->user_begin());
+    Cmp->setOperand(1, NewBTC);
+
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+bool VPlanTransforms::simplifyBranchConditionForVFAndUF(
+    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+    PredicatedScalarEvolution &PSE) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
@@ -1103,7 +1169,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
       !match(Term,
              m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
-    return;
+    return false;
 
   ScalarEvolution &SE = *PSE.getSE();
   const SCEV *TripCount =
@@ -1114,7 +1180,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
   if (TripCount->isZero() ||
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
-    return;
+    return false;
 
   // The vector loop region only executes once. If possible, completely remove
   // the region, otherwise replace the terminator controlling the latch with
@@ -1153,8 +1219,23 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
   Term->eraseFromParent();
 
-  Plan.setVF(BestVF);
-  Plan.setUF(BestUF);
+  return true;
+}
+
+void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                         unsigned BestUF,
+                                         PredicatedScalarEvolution &PSE) {
+  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
+
+  bool MadeChange =
+      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
+
+  if (MadeChange) {
+    Plan.setVF(BestVF);
+    Plan.setUF(BestUF);
+  }
   // TODO: Further simplifications are possible
   // 1. Replace inductions with constants.
   // 2. Replace vector loop region with VPBasicBlock.
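Worked through with the values from the latch_branch_cost test in this commit (trip count 100, VF * UF = 8), ComputeBitWidth gives: AlignedTC = 8 * ceil(100 / 8) = 104, MaxVal = 103, which needs 7 active bits, and PowerOf2Ceil(7) = 8 with a floor of 8 bits, so NewIVTy is i8 and both the widened induction and the truncated backedge-taken-count bound are rewritten in that type.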

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 7 additions & 0 deletions
@@ -98,6 +98,13 @@ struct VPlanTransforms {
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);
 
+  /// Try to simplify the branch condition of \p Plan. This may restrict the
+  /// resulting plan to \p BestVF and \p BestUF.
+  static bool simplifyBranchConditionForVFAndUF(VPlan &Plan,
+                                                ElementCount BestVF,
+                                                unsigned BestUF,
+                                                PredicatedScalarEvolution &PSE);
+
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
   /// block merging.

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 9 additions & 9 deletions
@@ -386,8 +386,8 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
 ; PRED: [[VECTOR_BODY]]:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 99)
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99)
 ; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
 ; PRED-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; PRED: [[PRED_STORE_IF]]:
@@ -453,7 +453,7 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]]
 ; PRED: [[PRED_STORE_CONTINUE14]]:
 ; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
 ; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED: [[MIDDLE_BLOCK]]:
@@ -790,9 +790,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
 ; DEFAULT: [[VECTOR_BODY]]:
 ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
 ; DEFAULT-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
 ; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
 ; DEFAULT-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; DEFAULT: [[PRED_STORE_IF]]:
@@ -865,7 +865,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]]
 ; DEFAULT: [[PRED_STORE_CONTINUE14]]:
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
@@ -892,9 +892,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
 ; PRED: [[VECTOR_BODY]]:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
 ; PRED-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
 ; PRED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
 ; PRED-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; PRED: [[PRED_STORE_IF]]:
@@ -967,7 +967,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]]
 ; PRED: [[PRED_STORE_CONTINUE14]]:
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PRED: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll

Lines changed: 3 additions & 3 deletions
@@ -19,10 +19,10 @@ define void @func_21() {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK: pred.load.if:
@@ -59,7 +59,7 @@ define void @func_21() {
 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; CHECK: pred.store.continue4:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:

llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll

Lines changed: 3 additions & 3 deletions
@@ -86,8 +86,8 @@ attributes #0 = { "target-cpu"="knl" }
 ; FORCE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FORCE: vector.body:
 ; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], splat (i32 2)
+; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i8> [[VEC_IND]], splat (i8 2)
 ; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FORCE: pred.store.if:
@@ -103,7 +103,7 @@ attributes #0 = { "target-cpu"="knl" }
 ; FORCE-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; FORCE: pred.store.continue2:
 ; FORCE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
 ; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
 ; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
 ;

llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll

Lines changed: 6 additions & 6 deletions
@@ -18,8 +18,8 @@ define void @pr45679(ptr %A) optsize {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
@@ -53,7 +53,7 @@ define void @pr45679(ptr %A) optsize {
 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
 ; CHECK: pred.store.continue6:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -213,8 +213,8 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
@@ -252,7 +252,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
 ; CHECK: pred.store.continue6:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
