llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 3 additions & 13 deletions b/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 3 additions & 13 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 5 additions & 8 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 5 additions & 8 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanUtils.cpp‎
Lines changed: 19 additions & 0 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanUtils.cpp‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanUtils.h‎
Lines changed: 6 additions & 0 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanUtils.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll‎
Lines changed: 214 additions & 21 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll‎
Lines changed: 214 additions & 21 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll‎
Lines changed: 119 additions & 11 deletions b/‎llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll‎
Lines changed: 119 additions & 11 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll‎
Lines changed: 5 additions & 5 deletions b/‎llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll‎
Lines changed: 5 additions & 5 deletions
@@ -5196,19 +5196,9 @@ static const SCEV *getAddressAccessSCEV(
   if (!Gep)
     return nullptr;
 
-  // We are looking for a gep with all loop invariant indices except for one
-  // which should be an induction variable.
-  auto *SE = PSE.getSE();
-  unsigned NumOperands = Gep->getNumOperands();
-  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
-    Value *Opd = Gep->getOperand(Idx);
-    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
-        !Legal->isInductionVariable(Opd))
-      return nullptr;
-  }
-
-  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
-  return PSE.getSCEV(Ptr);
+  const SCEV *Addr = PSE.getSCEV(Ptr);
+  return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
+                                                                    : nullptr;
 }
 
 InstructionCost
 
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -3123,15 +3124,11 @@ static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
     return nullptr;
 
-  // We are looking for a GEP where all indices are either loop invariant or
-  // inductions.
-  for (VPValue *Opd : drop_begin(PtrR->operands())) {
-    if (!Opd->isDefinedOutsideLoopRegions() &&
-        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return nullptr;
-  }
+  const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, SE, L);
+  if (isa<SCEVCouldNotCompute>(Addr))
+    return Addr;
 
-  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
+  return vputils::isAddressSCEVForCost(Addr, SE, L) ? Addr : nullptr;
 }
 
 /// Returns true if \p V is used as part of the address of another load or
 
@@ -13,9 +13,11 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 
 using namespace llvm;
 using namespace llvm::VPlanPatternMatch;
+using namespace llvm::SCEVPatternMatch;
 
 bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
   return all_of(Def->users(),
@@ -150,6 +152,23 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
 
+bool vputils::isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE,
+                                   const Loop *L) {
+  // If the address is an SCEVAddExpr, every operand must be either loop
+  // invariant or a (possibly sign-extended) affine AddRec.
+  if (auto *PtrAdd = dyn_cast<SCEVAddExpr>(Addr)) {
+    return all_of(PtrAdd->operands(), [&SE, L](const SCEV *Op) {
+      return SE.isLoopInvariant(Op, L) ||
+             match(Op, m_scev_SExt(m_scev_AffineAddRec(m_SCEV(), m_SCEV()))) ||
+             match(Op, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
+    });
+  }
+
+  // Otherwise, the address itself must be loop invariant or an affine AddRec.
+  return SE.isLoopInvariant(Addr, L) ||
+         match(Addr, m_scev_AffineAddRec(m_SCEV(), m_SCEV()));
+}
+
 /// Returns true if \p Opcode preserves uniformity, i.e., if all operands are
 /// uniform, the result will also be uniform.
 static bool preservesUniformity(unsigned Opcode) {
 
@@ -42,6 +42,12 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
 const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
                                   const Loop *L = nullptr);
 
+/// Returns true if \p Addr is an address SCEV that can be passed to
+/// TTI::getAddressComputationCost, i.e. the address SCEV is loop invariant, an
+/// affine AddRec (i.e. an induction), or an add expression whose operands are
+/// each loop invariant or a (possibly sign-extended) affine AddRec.
+bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L);
+
 /// Returns true if \p VPV is a single scalar, either because it produces the
 /// same value for all lanes or only has its first lane used.
 bool isSingleScalar(const VPValue *VPV);
 
@@ -9,20 +9,122 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 @kernel4 = global [512 x float] zeroinitializer, align 4
 @src_data = global [1536 x float] zeroinitializer, align 4
 
-; We don't want to vectorize most loops containing gathers because they are
-; expensive.
-; Make sure we don't vectorize it.
+; The cost of gathers in the loop gets offset by the vector math.
 
 define float @_Z4testmm(i64 %size, i64 %offset) {
 ; CHECK-LABEL: define float @_Z4testmm(
 ; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP75:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP76:%.*]] = mul <4 x i64> [[TMP75]], splat (i64 3)
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i64> [[TMP76]], i32 0
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i64> [[TMP76]], i32 1
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i64> [[TMP76]], i32 2
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i64> [[TMP76]], i32 3
+; CHECK-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP77]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP78]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP79]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP80]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP81]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <4 x float> [[TMP17]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <4 x float> [[TMP19]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul fast <4 x float> [[TMP21]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = fmul fast <4 x float> [[TMP23]], [[WIDE_LOAD5]]
+; CHECK-NEXT:    [[TMP26]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add <4 x i64> [[TMP76]], splat (i64 1)
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP27]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP27]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP27]], i32 2
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP27]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP28]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP31]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load float, ptr [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP33]], align 4
+; CHECK-NEXT:    [[TMP38:%.*]] = load float, ptr [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP35]], align 4
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i32 0
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 1
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 2
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP46]]
+; CHECK-NEXT:    [[TMP48]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = add <4 x i64> [[TMP76]], splat (i64 2)
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i64> [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i64> [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <4 x i64> [[TMP49]], i32 2
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i64> [[TMP49]], i32 3
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP50]]
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP51]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP52]]
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP53]]
+; CHECK-NEXT:    [[TMP58:%.*]] = load float, ptr [[TMP54]], align 4
+; CHECK-NEXT:    [[TMP59:%.*]] = load float, ptr [[TMP55]], align 4
+; CHECK-NEXT:    [[TMP60:%.*]] = load float, ptr [[TMP56]], align 4
+; CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[TMP57]], align 4
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x float> poison, float [[TMP58]], i32 0
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[TMP59]], i32 1
+; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 2
+; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 3
+; CHECK-NEXT:    [[TMP66:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP66]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP69]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP72:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP26]])
+; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP48]])
+; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP70]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP72]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX7:%.*]] = phi float [ [[TMP74]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX_0:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX_1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED_2:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_0:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_1:%.*]] = phi float [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED_2:%.*]] = phi float [ [[BC_MERGE_RDX7]], %[[SCALAR_PH]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[ADD]], 3
 ; CHECK-NEXT:    [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
@@ -58,11 +160,11 @@ define float @_Z4testmm(i64 %size, i64 %offset) {
 ; CHECK-NEXT:    [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ], [ [[TMP72]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ], [ [[TMP74]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
 ; CHECK-NEXT:    [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
 ; CHECK-NEXT:    ret float [[RES_1]]
@@ -117,3 +219,9 @@ exit:
   %res.1 = fadd float %res.0, %rdx.2.next
   ret float %res.1
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
@@ -17,17 +17,17 @@ define void @test() {
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; SSE2:  LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; SSE2:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1:  LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1:  LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1:  LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4