From 6b914241635f8d3804c0837ca30621051526c6b6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 5 Oct 2025 19:49:54 +0100 Subject: [PATCH] Reapply "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053)" This reverts commit f80c0baf058dbdc5 and 94eade61a02ae5. Recommit a small fix for targets using prefersVectorizedAddressing. Original message: Update VPReplicateRecipe::computeCost to compute costs of more replicating loads/stores. There are 2 cases that require extra checks to match the legacy cost model: 1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost. SCEV computations will be added as follow-up 2. If a load is used as part of an address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by a usedByLoadOrStore helper. Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't each for replicating recipes to skip their cost, except other loads. This again can be further improved by modeling inserts/extracts explicitly and consistently, and compute costs for those operations directly where needed. PR: https://github.com/llvm/llvm-project/pull/160053 --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 ++- llvm/lib/Transforms/Vectorize/VPlan.cpp | 9 +- llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16 ++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 124 ++++++++++++++--- .../ARM/replicating-load-store-costs.ll | 84 ++++++++++++ .../X86/replicating-load-store-costs.ll | 126 ++++++++++++++++++ 6 files changed, 346 insertions(+), 29 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d393a9c1506d6..cee08ef94aeb5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 07b191a787806..2555ebe2ad897 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef Operands, ElementCount VF) { + Type *ResultTy, ArrayRef Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet UniqueOperands; SmallVector Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa(Op)) || + (isa(Op) && + cast(Op)->getOpcode() == Instruction::Load) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fc1a09e9850f6..1580a3be3180a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 67b9244e9dc72..60b08a31dd0c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -40,6 +40,7 @@ #include using namespace llvm; +using namespace llvm::VPlanPatternMatch; using VectorParts = SmallVector; @@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, VPRecipeBase *OpR = Op->getDefiningRecipe(); // If the partial reduction is predicated, a select will be operand 0 - using namespace llvm::VPlanPatternMatch; if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { OpR = Op->getDefiningRecipe(); } @@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); VPValue *Op0, *Op1; - using namespace llvm::VPlanPatternMatch; if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { @@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa(PtrR) && + cast(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa(PtrR) || + match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet Seen; + SmallVector WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast(getUnderlyingValue()); @@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + bool UsedByLoadStoreAddress = + !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, + nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + if (!UsedByLoadStoreAddress) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..fd83a012541b5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "armv7-unknown-linux-gnueabihf" + +define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) { +; CHECK-LABEL: define void @replicating_load_used_by_other_load( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[IV]], 1 +; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[IV]], 1 +; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[IV]], 2 +; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 [[IV]], 1 +; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], 2 +; CHECK-NEXT: [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]] +; CHECK-NEXT: [[OR_2:%.*]] = or i32 [[OR_1]], [[SHL_1]] +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]] +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]] +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]] +; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[XOR_3]], 1 +; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[IV]], 2147483646 +; CHECK-NEXT: [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]] +; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[IV]], 254 +; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 [[OR_3]], 1 +; CHECK-NEXT: [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2 +; CHECK-NEXT: [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]] +; CHECK-NEXT: [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]] +; CHECK-NEXT: [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]] +; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[XOR_6]], 255 +; CHECK-NEXT: [[XOR_7:%.*]] = xor i32 [[AND_6]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]] +; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD]] to i32 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ] + %shr = lshr i32 %iv, 1 + %and.1 = and i32 %iv, 1 + %shl.1 = shl i32 %iv, 2 + %shl.2 = shl i32 %iv, 1 + %and.2 = and i32 %shl.2, 2 + %or.1 = or i32 %and.2, %and.1 + %or.2 = or i32 %or.1, %shl.1 + %xor.1 = xor i32 %b, %or.2 + %xor.2 = xor i32 %xor.1, %arg + %shr.2 = lshr i32 %shl.1, 1 + %xor.3 = xor i32 %shr, %arg + %and.3 = and i32 %xor.3, 1 + %and.4 = and i32 %iv, 2147483646 + %or.3 = or i32 %and.3, %and.4 + %and.5 = and i32 %iv, 254 + %shl.3 = shl i32 %or.3, 1 + %xor.4 = xor i32 %shl.3, 2 + %or.4 = or i32 %and.5, %xor.4 + %xor.5 = xor i32 %shr.2, %or.4 + %xor.6 = xor i32 %xor.5, %xor.2 + %and.6 = and i32 %xor.6, 255 + %xor.7 = xor i32 %and.6, 1 + %gep = getelementptr i8, ptr %a, i32 %xor.7 + %ld = load i8, ptr %gep, align 1 + %zext = zext i8 %ld to i32 + %gep.2 = getelementptr i32, ptr null, i32 %zext + store i32 0, ptr %gep.2, align 4 + %iv.next = add i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index 87848730c8f01..f5329cf86451b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -454,6 +454,132 @@ exit: ret void } +declare i1 @cond() + +define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %ptr.c) { +; I64-LABEL: define double @test_load_used_by_other_load_scev( +; I64-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I64-NEXT: [[ENTRY:.*]]: +; I64-NEXT: br label %[[OUTER_LOOP:.*]] +; I64: [[OUTER_LOOP_LOOPEXIT:.*]]: +; I64-NEXT: br label %[[OUTER_LOOP]] +; I64: [[OUTER_LOOP]]: +; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ] +; I64-NEXT: [[COND:%.*]] = call i1 @cond() +; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; I64: [[INNER_LOOP_PREHEADER]]: +; I64-NEXT: br label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1 +; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1 +; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]] +; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]] +; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]] +; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]] +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8 +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer +; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8 +; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8 +; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8 +; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0 +; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1 +; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer +; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> +; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer +; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer +; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00) +; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8 +; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8 +; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0 +; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1 +; I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]] +; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]] +; I64-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1 +; I64-NEXT: br label %[[OUTER_LOOP_LOOPEXIT]] +; I64: [[EXIT]]: +; I64-NEXT: ret double [[ACCUM]] +; +; I32-LABEL: define double @test_load_used_by_other_load_scev( +; I32-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I32-NEXT: [[ENTRY:.*]]: +; I32-NEXT: br label %[[OUTER_LOOP:.*]] +; I32: [[OUTER_LOOP]]: +; I32-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ] +; I32-NEXT: [[COND:%.*]] = call i1 @cond() +; I32-NEXT: br i1 [[COND]], label %[[INNER_LOOP]], label %[[EXIT:.*]] +; I32: [[INNER_LOOP]]: +; I32-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[OUTER_LOOP]] ], [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[ACCUM]], %[[OUTER_LOOP]] ], [ [[MUL1:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[IDX_PLUS1:%.*]] = add i64 [[IV]], 1 +; I32-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[GEP_A_I64:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[LOAD_IDX:%.*]] = load i64, ptr [[GEP_A_I64]], align 8 +; I32-NEXT: [[GEP_B:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[LOAD_IDX]] +; I32-NEXT: [[LOAD_A:%.*]] = load double, ptr [[PTR_A]], align 8 +; I32-NEXT: [[ADD1:%.*]] = fadd double [[LOAD_A]], 0.000000e+00 +; I32-NEXT: [[GEP_C_OFFSET:%.*]] = getelementptr i8, ptr [[GEP_C]], i64 8 +; I32-NEXT: [[LOAD_C:%.*]] = load double, ptr [[GEP_C_OFFSET]], align 8 +; I32-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00 +; I32-NEXT: [[MUL2:%.*]] = fmul double [[LOAD_C]], 0.000000e+00 +; I32-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00 +; I32-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00 +; I32-NEXT: [[LOAD_B:%.*]] = load double, ptr [[GEP_B]], align 8 +; I32-NEXT: [[DIV:%.*]] = fdiv double [[LOAD_B]], [[ADD3]] +; I32-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]] +; I32-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; I32-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1 +; I32-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP]], label %[[INNER_LOOP]] +; I32: [[EXIT]]: +; I32-NEXT: ret double [[ACCUM]] +; +entry: + br label %outer.loop + +outer.loop: + %accum = phi double [ 0.0, %entry ], [ %result, %inner.loop ] + %cond = call i1 @cond() + br i1 %cond, label %inner.loop, label %exit + +inner.loop: + %iv = phi i64 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] + %accum.inner = phi double [ %accum, %outer.loop ], [ %mul1, %inner.loop ] + %idx.plus1 = add i64 %iv, 1 + %gep.c = getelementptr i8, ptr %ptr.c, i64 %idx.plus1 + %gep.a.i64 = getelementptr i64, ptr %ptr.a, i64 %idx.plus1 + %load.idx = load i64, ptr %gep.a.i64, align 8 + %gep.b = getelementptr double, ptr %ptr.b, i64 %load.idx + %load.a = load double, ptr %ptr.a, align 8 + %add1 = fadd double %load.a, 0.000000e+00 + %gep.c.offset = getelementptr i8, ptr %gep.c, i64 8 + %load.c = load double, ptr %gep.c.offset, align 8 + %mul1 = fmul double %add1, 0.000000e+00 + %mul2 = fmul double %load.c, 0.000000e+00 + %add2 = fadd double %mul2, 0.000000e+00 + %add3 = fadd double %add2, 1.000000e+00 + %load.b = load double, ptr %gep.b, align 8 + %div = fdiv double %load.b, %add3 + %result = fsub double %accum.inner, %div + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1 + br i1 %exitcond, label %outer.loop, label %inner.loop + +exit: + ret double %accum +} + attributes #0 = { "target-cpu"="znver2" } !0 = distinct !{!0, !1}