
Commit 8fbc0c5

[LV] Check Addr in getAddressAccessSCEV in terms of SCEV expressions.
getAddressAccessSCEV previously had some restrictive checks that limited the pointer SCEV expressions passed to TTI to GEPs whose operands must either be invariant or marked as inductions. As a consequence, the check rejected things like `GEP %base, (%iv + 1)`, even though the SCEV for that GEP is just as easily analyzable as the one for `GEP %base, %iv`, the only difference being the start of the AddRec being adjusted by 1.

This patch changes the code to use a SCEV-based check, limiting the address SCEV to be loop invariant, an affine AddRec (i.e. an induction), or an add expression whose operands are such expressions or sign-extended AddRecs. This catches all cases the existing getAddressAccessSCEV caught, plus additional ones like the case mentioned above. We therefore pass address SCEVs in more cases, giving the backends a better chance to make informed decisions. It also unifies the decision of when to use an address SCEV between the legacy and the VPlan-based cost models.

The gather-cost.ll tests illustrate the impact: previously they were considered not profitable to vectorize, because we failed to determine that

  %gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul

has a relatively small constant stride.

There may be some rough edges in the cost models, where not passing pointer SCEVs hid some incorrect modeling, but those issues should be fixed in the target cost models if they surface.
1 parent: e3905a4
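A concrete illustration (not part of the commit): the standalone C++ sketch below parses the motivating IR shape and prints the SCEVs that ScalarEvolution forms for a GEP indexed by %iv and one indexed by %iv + 1. The IR, names, and expected-output comments are illustrative assumptions, and the exact analysis constructor signatures can vary slightly between LLVM versions.

// scev-demo.cpp: print address SCEVs for GEPs indexed by %iv and %iv + 1.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

int main() {
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(R"IR(
define void @f(ptr %base, i64 %n) {
entry:
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %iv.1 = add i64 %iv, 1
  %gep0 = getelementptr inbounds float, ptr %base, i64 %iv
  %gep1 = getelementptr inbounds float, ptr %base, i64 %iv.1
  %l0 = load float, ptr %gep0, align 4
  %l1 = load float, ptr %gep1, align 4
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop
exit:
  ret void
}
)IR",
                                                              Err, Ctx);
  if (!M) {
    Err.print("scev-demo", llvm::errs());
    return 1;
  }
  llvm::Function &F = *M->getFunction("f");
  // Build the analyses ScalarEvolution depends on.
  llvm::DominatorTree DT(F);
  llvm::LoopInfo LI(DT);
  llvm::AssumptionCache AC(F);
  llvm::TargetLibraryInfoImpl TLII;
  llvm::TargetLibraryInfo TLI(TLII);
  llvm::ScalarEvolution SE(F, TLI, AC, DT, LI);
  for (llvm::BasicBlock &BB : F)
    for (llvm::Instruction &I : BB)
      if (I.getName().starts_with("gep"))
        // Expected output, roughly: {%base,+,4}<%loop> for %gep0 and
        // {(4 + %base),+,4}<%loop> for %gep1 -- the same affine AddRec with
        // the start shifted by one element, so both now pass the new check.
        llvm::outs() << I.getName() << ": " << *SE.getSCEV(&I) << "\n";
  return 0;
}

Both GEPs should print as affine AddRecs over %loop that differ only in their start, which is exactly the shape isAddressSCEVForCost accepts.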

File tree: 59 files changed, +1938 -1347 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 13 deletions
@@ -5196,19 +5196,9 @@ static const SCEV *getAddressAccessSCEV(
   if (!Gep)
     return nullptr;
 
-  // We are looking for a gep with all loop invariant indices except for one
-  // which should be an induction variable.
-  auto *SE = PSE.getSE();
-  unsigned NumOperands = Gep->getNumOperands();
-  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
-    Value *Opd = Gep->getOperand(Idx);
-    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
-        !Legal->isInductionVariable(Opd))
-      return nullptr;
-  }
-
-  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
-  return PSE.getSCEV(Ptr);
+  const SCEV *Addr = PSE.getSCEV(Ptr);
+  return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
+                                                                    : nullptr;
 }
 
 InstructionCost

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 5 additions & 8 deletions
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -3123,15 +3124,11 @@ static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
       match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
     return nullptr;
 
-  // We are looking for a GEP where all indices are either loop invariant or
-  // inductions.
-  for (VPValue *Opd : drop_begin(PtrR->operands())) {
-    if (!Opd->isDefinedOutsideLoopRegions() &&
-        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return nullptr;
-  }
+  const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, SE, L);
+  if (isa<SCEVCouldNotCompute>(Addr))
+    return Addr;
 
-  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
+  return vputils::isAddressSCEVForCost(Addr, SE, L) ? Addr : nullptr;
 }
 
 /// Returns true if \p V is used as part of the address of another load or

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 19 additions & 0 deletions
@@ -13,9 +13,11 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 
 using namespace llvm;
 using namespace llvm::VPlanPatternMatch;
+using namespace llvm::SCEVPatternMatch;
 
 bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
   return all_of(Def->users(),
@@ -150,6 +152,23 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
 
+bool vputils::isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE,
+                                   const Loop *L) {
+  // If the address is a SCEVAddExpr, all operands must either be invariant or
+  // a (possibly sign-extended) affine AddRec.
+  if (auto *PtrAdd = dyn_cast<SCEVAddExpr>(Addr)) {
+    return all_of(PtrAdd->operands(), [&SE, L](const SCEV *Op) {
+      return SE.isLoopInvariant(Op, L) ||
+             match(Op, m_scev_SExt(m_scev_AffineAddRec(m_SCEV(), m_SCEV()))) ||
+             match(Op, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
+    });
+  }
+
+  // Otherwise, check if the address is loop invariant or an affine add
+  // recurrence.
+  return SE.isLoopInvariant(Addr, L) ||
+         match(Addr, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
+}
+
 /// Returns true if \p Opcode preserves uniformity, i.e., if all operands are
 /// uniform, the result will also be uniform.
 static bool preservesUniformity(unsigned Opcode) {
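Since the new helper leans on llvm/Analysis/ScalarEvolutionPatternMatch.h, a hand-rolled equivalent may be useful as a reading aid, or on older trees that lack the pattern matchers. The sketch below is hypothetical, not LLVM's API; it is intended to behave the same as the committed vputils::isAddressSCEVForCost above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"

using namespace llvm;

// Helper: Op is loop invariant or an affine AddRec; if AllowSExt is set, a
// sign-extend wrapping the AddRec is looked through, mirroring the
// m_scev_SExt(m_scev_AffineAddRec(...)) pattern in the committed code.
static bool isInvariantOrAffineAddRec(const SCEV *Op, ScalarEvolution &SE,
                                      const Loop *L, bool AllowSExt) {
  if (SE.isLoopInvariant(Op, L))
    return true;
  if (AllowSExt)
    if (auto *SExt = dyn_cast<SCEVSignExtendExpr>(Op))
      Op = SExt->getOperand();
  auto *AR = dyn_cast<SCEVAddRecExpr>(Op);
  return AR && AR->isAffine();
}

// Hypothetical stand-in for vputils::isAddressSCEVForCost.
static bool isAddressSCEVForCostCompat(const SCEV *Addr, ScalarEvolution &SE,
                                       const Loop *L) {
  // An add expression qualifies if every operand is invariant or a (possibly
  // sign-extended) affine AddRec.
  if (auto *Add = dyn_cast<SCEVAddExpr>(Addr))
    return all_of(Add->operands(), [&](const SCEV *Op) {
      return isInvariantOrAffineAddRec(Op, SE, L, /*AllowSExt=*/true);
    });
  // Otherwise the whole address must be invariant or an affine AddRec.
  return isInvariantOrAffineAddRec(Addr, SE, L, /*AllowSExt=*/false);
}

Note the asymmetry mirrored from the committed check: a sign-extend is only looked through for the operands of an add expression, not for the address expression as a whole.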

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,12 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
 const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
                                   const Loop *L = nullptr);
 
+/// Returns true if \p Addr is an address SCEV that can be passed to
+/// TTI::getAddressComputationCost, i.e. the address SCEV is loop invariant, an
+/// affine AddRec (i.e. an induction), or an add expression of such operands or
+/// sign-extended AddRecs.
+bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L);
+
 /// Returns true if \p VPV is a single scalar, either because it produces the
 /// same value for all lanes or only has its first lane used.
 bool isSingleScalar(const VPValue *VPV);

llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll

Lines changed: 214 additions & 21 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll

Lines changed: 119 additions & 11 deletions
@@ -9,20 +9,122 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 @kernel4 = global [512 x float] zeroinitializer, align 4
 @src_data = global [1536 x float] zeroinitializer, align 4
 
-; We don't want to vectorize most loops containing gathers because they are
-; expensive.
-; Make sure we don't vectorize it.
+; The cost of gathers in the loop gets offset by the vector math.
 
 define float @_Z4testmm(i64 %size, i64 %offset) {
 ; CHECK-LABEL: define float @_Z4testmm(
 ; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) {
 ; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP75:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP76:%.*]] = mul <4 x i64> [[TMP75]], splat (i64 3)
+; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i64> [[TMP76]], i32 0
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i64> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i64> [[TMP76]], i32 2
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i64> [[TMP76]], i32 3
+; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP77]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP78]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP79]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP80]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP81]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i32 2
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[TMP17]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <4 x float> [[TMP19]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = fmul fast <4 x float> [[TMP21]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <4 x float> [[TMP23]], [[WIDE_LOAD5]]
+; CHECK-NEXT: [[TMP26]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i64> [[TMP76]], splat (i64 1)
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP27]], i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP27]], i32 1
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP27]], i32 2
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP27]], i32 3
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP29]]
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP30]]
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP31]]
+; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP32]], align 4
+; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP33]], align 4
+; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP34]], align 4
+; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP35]], align 4
+; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i32 0
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 1
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 2
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i32 3
+; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP46]]
+; CHECK-NEXT: [[TMP48]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i64> [[TMP76]], splat (i64 2)
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i64> [[TMP49]], i32 0
+; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i64> [[TMP49]], i32 1
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i64> [[TMP49]], i32 2
+; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i64> [[TMP49]], i32 3
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP50]]
+; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP51]]
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP52]]
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP53]]
+; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[TMP54]], align 4
+; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[TMP55]], align 4
+; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[TMP56]], align 4
+; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[TMP57]], align 4
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x float> poison, float [[TMP58]], i32 0
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[TMP59]], i32 1
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 2
+; CHECK-NEXT: [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 3
+; CHECK-NEXT: [[TMP66:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP65]]
+; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP66]]
+; CHECK-NEXT: [[TMP68:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP68]]
+; CHECK-NEXT: [[TMP70]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP69]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP72:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP26]])
+; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP48]])
+; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP70]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP72]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi float [ [[TMP74]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED_2:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_2:%.*]] = phi float [ [[BC_MERGE_RDX7]], %[[SCALAR_PH]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
 ; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
 ; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
@@ -58,11 +160,11 @@ define float @_Z4testmm(i64 %size, i64 %offset) {
 ; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ], [ [[TMP72]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ], [ [[TMP74]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
 ; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
 ; CHECK-NEXT: ret float [[RES_1]]
@@ -117,3 +219,9 @@ exit:
   %res.1 = fadd float %res.0, %rdx.2.next
   ret float %res.1
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll

Lines changed: 5 additions & 5 deletions
@@ -17,17 +17,17 @@ define void @test() {
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; SSE2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
