@@ -1253,7 +1253,7 @@ class BoUpSLP {
12531253 NonScheduledFirst.clear();
12541254 EntryToLastInstruction.clear();
12551255 ExternalUses.clear();
1256- ExternalUsesAsGEPs.clear();
1256+ ExternalUsesAsOriginalScalar.clear();
12571257 for (auto &Iter : BlocksSchedules) {
12581258 BlockScheduling *BS = Iter.second.get();
12591259 BS->clear();
@@ -3468,7 +3468,7 @@ class BoUpSLP {
34683468
34693469 /// A list of GEPs which can be replaced by scalar GEPs instead of
34703470 /// extractelement instructions.
3471- SmallPtrSet<Value *, 4> ExternalUsesAsGEPs ;
3471+ SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar ;
34723472
34733473 /// Values used only by @llvm.assume calls.
34743474 SmallPtrSet<const Value *, 32> EphValues;
@@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1066310663 SmallDenseSet<Value *, 4> UsedInserts;
1066410664 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
1066510665 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10666+ DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
1066610667 for (ExternalUser &EU : ExternalUses) {
1066710668 // We only add extract cost once for the same scalar.
1066810669 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1077110772 }
1077210773 }
1077310774 }
10774- // Leave the GEPs as is, they are free in most cases and better to keep them
10775- // as GEPs.
10775+
1077610776 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10777- if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10777+ // If we plan to rewrite the tree in a smaller type, we will need to sign
10778+ // extend the extracted value back to the original type. Here, we account
10779+ // for the extract and the added cost of the sign extend if needed.
10780+ InstructionCost ExtraCost = TTI::TCC_Free;
10781+ auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10782+ const TreeEntry *Entry = getTreeEntry(EU.Scalar);
10783+ auto It = MinBWs.find(Entry);
10784+ if (It != MinBWs.end()) {
10785+ auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10786+ unsigned Extend =
10787+ It->second.second ? Instruction::SExt : Instruction::ZExt;
10788+ VecTy = getWidenedType(MinTy, BundleWidth);
10789+ ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10790+ VecTy, EU.Lane);
10791+ } else {
10792+ ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10793+ CostKind, EU.Lane);
10794+ }
10795+ // Leave the scalar instructions as is if they are cheaper than extracts.
10796+ if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
10797+ Entry->getOpcode() == Instruction::Load) {
1077810798 if (!ValueToExtUses) {
1077910799 ValueToExtUses.emplace();
1078010800 for_each(enumerate(ExternalUses), [&](const auto &P) {
10801+ // Ignore phis in loops.
10802+ if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
10803+ auto *I = cast<Instruction>(P.value().Scalar);
10804+ const Loop *L = LI->getLoopFor(Phi->getParent());
10805+ if (L && (Phi->getParent() == I->getParent() ||
10806+ L == LI->getLoopFor(I->getParent())))
10807+ return;
10808+ }
10809+
1078110810 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
1078210811 });
1078310812 }
10784- // Can use original GEP, if no operands vectorized or they are marked as
10785- // externally used already.
10786- bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10787- if (!getTreeEntry(V))
10788- return true;
10789- auto It = ValueToExtUses->find(V);
10790- if (It != ValueToExtUses->end()) {
10791- // Replace all uses to avoid compiler crash.
10792- ExternalUses[It->second].User = nullptr;
10813+ // Can use original instruction, if no operands vectorized or they are
10814+ // marked as externally used already.
10815+ auto *Inst = cast<Instruction>(EU.Scalar);
10816+ bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
10817+ if (!getTreeEntry(V)) {
10818+ // Some extractelements might be not vectorized, but
10819+ // transformed into shuffle and removed from the function,
10820+ // consider it here.
10821+ if (auto *EE = dyn_cast<ExtractElementInst>(V))
10822+ return !EE->hasOneUse() || !MustGather.contains(EE);
1079310823 return true;
1079410824 }
10795- return false ;
10825+ return ValueToExtUses->contains(V) ;
1079610826 });
10797- if (CanBeUsedAsGEP) {
10798- ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10799- ExternalUsesAsGEPs.insert(EU.Scalar);
10800- continue;
10827+ if (CanBeUsedAsScalar) {
10828+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
10829+ bool KeepScalar = ScalarCost <= ExtraCost;
10830+ if (KeepScalar && ScalarCost != TTI::TCC_Free &&
10831+ ExtraCost - ScalarCost <= TTI::TCC_Basic) {
10832+ unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
10833+ return ValueToExtUses->contains(V);
10834+ });
10835+ auto It = ExtractsCount.find(Entry);
10836+ if (It != ExtractsCount.end())
10837+ ScalarUsesCount -= It->getSecond().size();
10838+ // Keep original scalar if number of externally used instructions in
10839+ // the same entry is not power of 2. It may help to do some extra
10840+ // vectorization for now.
10841+ KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
10842+ }
10843+ if (KeepScalar) {
10844+ ExternalUsesAsOriginalScalar.insert(EU.Scalar);
10845+ for_each(Inst->operands(), [&](Value *V) {
10846+ auto It = ValueToExtUses->find(V);
10847+ if (It != ValueToExtUses->end()) {
10848+ // Replace all uses to avoid compiler crash.
10849+ ExternalUses[It->second].User = nullptr;
10850+ }
10851+ });
10852+ ExtraCost = ScalarCost;
10853+ ExtractsCount[Entry].insert(Inst);
10854+ }
1080110855 }
1080210856 }
1080310857
10804- // If we plan to rewrite the tree in a smaller type, we will need to sign
10805- // extend the extracted value back to the original type. Here, we account
10806- // for the extract and the added cost of the sign extend if needed.
10807- auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10808- auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10809- if (It != MinBWs.end()) {
10810- auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10811- unsigned Extend =
10812- It->second.second ? Instruction::SExt : Instruction::ZExt;
10813- VecTy = getWidenedType(MinTy, BundleWidth);
10814- ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10815- VecTy, EU.Lane);
10816- } else {
10817- ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10818- CostKind, EU.Lane);
10819- }
10858+ ExtractCost += ExtraCost;
1082010859 }
1082110860 // Add reduced value cost, if resized.
1082210861 if (!VectorizedVals.empty()) {
@@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree(
1406714106 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
1406814107 // Maps extract Scalar to the corresponding extractelement instruction in the
1406914108 // basic block. Only one extractelement per block should be emitted.
14070- DenseMap<Value *,
14071- DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
14109+ DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
1407214110 ScalarToEEs;
1407314111 SmallDenseSet<Value *, 4> UsedInserts;
1407414112 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
@@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree(
1409814136 if (Scalar->getType() != Vec->getType()) {
1409914137 Value *Ex = nullptr;
1410014138 Value *ExV = nullptr;
14101- auto *GEP = dyn_cast<GetElementPtrInst >(Scalar);
14102- bool ReplaceGEP = GEP && ExternalUsesAsGEPs .contains(GEP );
14139+ auto *Inst = dyn_cast<Instruction >(Scalar);
14140+ bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar .contains(Inst );
1410314141 auto It = ScalarToEEs.find(Scalar);
1410414142 if (It != ScalarToEEs.end()) {
1410514143 // No need to emit many extracts, just move the only one in the
1410614144 // current block.
14107- auto EEIt = It->second.find(Builder.GetInsertBlock());
14145+ auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
14146+ : Builder.GetInsertBlock());
1410814147 if (EEIt != It->second.end()) {
14109- Instruction *I = EEIt->second.first;
14110- if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
14148+ Value *PrevV = EEIt->second.first;
14149+ if (auto *I = dyn_cast<Instruction>(PrevV);
14150+ I && !ReplaceInst &&
14151+ Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
1411114152 Builder.GetInsertPoint()->comesBefore(I)) {
1411214153 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
1411314154 Builder.GetInsertPoint());
14114- if (auto *CI = EEIt->second.second)
14155+ if (auto *CI = dyn_cast<Instruction>( EEIt->second.second) )
1411514156 CI->moveAfter(I);
1411614157 }
14117- Ex = I ;
14158+ Ex = PrevV ;
1411814159 ExV = EEIt->second.second ? EEIt->second.second : Ex;
1411914160 }
1412014161 }
1412114162 if (!Ex) {
1412214163 // "Reuse" the existing extract to improve final codegen.
14123- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14124- ES && isa<Instruction>(Vec)) {
14164+ if (ReplaceInst) {
14165+ // Leave the instruction as is, if it is cheaper than extracts and all
14166+ // operands are scalar.
14167+ auto *CloneInst = Inst->clone();
14168+ CloneInst->insertBefore(Inst);
14169+ if (Inst->hasName())
14170+ CloneInst->takeName(Inst);
14171+ Ex = CloneInst;
14172+ } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14173+ ES && isa<Instruction>(Vec)) {
1412514174 Value *V = ES->getVectorOperand();
1412614175 auto *IVec = cast<Instruction>(Vec);
1412714176 if (const TreeEntry *ETE = getTreeEntry(V))
@@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree(
1413214181 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
1413314182 else
1413414183 Ex = Builder.CreateExtractElement(Vec, Lane);
14135- } else if (ReplaceGEP) {
14136- // Leave the GEPs as is, they are free in most cases and better to
14137- // keep them as GEPs.
14138- auto *CloneGEP = GEP->clone();
14139- if (isa<Instruction>(Vec))
14140- CloneGEP->insertBefore(*Builder.GetInsertBlock(),
14141- Builder.GetInsertPoint());
14142- else
14143- CloneGEP->insertBefore(GEP);
14144- if (GEP->hasName())
14145- CloneGEP->takeName(GEP);
14146- Ex = CloneGEP;
1414714184 } else if (auto *VecTy =
1414814185 dyn_cast<FixedVectorType>(Scalar->getType())) {
1414914186 assert(SLPReVec && "FixedVectorType is not expected.");
@@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree(
1416414201 if (Scalar->getType() != Ex->getType())
1416514202 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
1416614203 MinBWs.find(E)->second.second);
14167- if ( auto *I = dyn_cast<Instruction>(Ex))
14168- ScalarToEEs[Scalar].try_emplace(
14169- Builder.GetInsertBlock (),
14170- std::make_pair(I, cast<Instruction>( ExV) ));
14204+ auto *I = dyn_cast<Instruction>(Ex);
14205+ ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
14206+ : &F->getEntryBlock (),
14207+ std::make_pair(Ex, ExV));
1417114208 }
1417214209 // The then branch of the previous if may produce constants, since 0
1417314210 // operand might be a constant.
14174- if (auto *ExI = dyn_cast<Instruction>(Ex)) {
14211+ if (auto *ExI = dyn_cast<Instruction>(Ex);
14212+ ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
1417514213 GatherShuffleExtractSeq.insert(ExI);
1417614214 CSEBlocks.insert(ExI->getParent());
1417714215 }
@@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree(
1419214230 continue;
1419314231 assert((ExternallyUsedValues.count(Scalar) ||
1419414232 Scalar->hasNUsesOrMore(UsesLimit) ||
14233+ ExternalUsesAsOriginalScalar.contains(Scalar) ||
1419514234 any_of(Scalar->users(),
1419614235 [&](llvm::User *U) {
14197- if (ExternalUsesAsGEPs .contains(U))
14236+ if (ExternalUsesAsOriginalScalar .contains(U))
1419814237 return true;
1419914238 TreeEntry *UseEntry = getTreeEntry(U);
1420014239 return UseEntry &&
0 commit comments