@@ -11571,6 +11571,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1157111571 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
1157211572 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
1157311573 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
11574+ SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
1157411575 for (ExternalUser &EU : ExternalUses) {
1157511576 // Uses by ephemeral values are free (because the ephemeral value will be
1157611577 // removed prior to code generation, and so the extraction will be
@@ -11706,7 +11707,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1170611707 // Can use original instruction, if no operands vectorized or they are
1170711708 // marked as externally used already.
1170811709 auto *Inst = cast<Instruction>(EU.Scalar);
11709- bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
11710+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
11711+ auto OperandIsScalar = [&](Value *V) {
1171011712 if (!getTreeEntry(V)) {
1171111713 // Some extractelements might be not vectorized, but
1171211714 // transformed into shuffle and removed from the function,
@@ -11716,9 +11718,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1171611718 return true;
1171711719 }
1171811720 return ValueToExtUses->contains(V);
11719- });
11721+ };
11722+ bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
11723+ bool CanBeUsedAsScalarCast = false;
11724+ if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
11725+ if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
11726+ Op && all_of(Op->operands(), OperandIsScalar)) {
11727+ InstructionCost OpCost =
11728+ (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
11729+ ? TTI->getInstructionCost(Op, CostKind)
11730+ : 0;
11731+ if (ScalarCost + OpCost <= ExtraCost) {
11732+ CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
11733+ ScalarCost += OpCost;
11734+ }
11735+ }
11736+ }
1172011737 if (CanBeUsedAsScalar) {
11721- InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
1172211738 bool KeepScalar = ScalarCost <= ExtraCost;
1172311739 // Try to keep original scalar if the user is the phi node from the same
1172411740 // block as the root phis, currently vectorized. It allows to keep
@@ -11774,12 +11790,34 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1177411790 ExtraCost = ScalarCost;
1177511791 if (!IsPhiInLoop(EU))
1177611792 ExtractsCount[Entry].insert(Inst);
11793+ if (CanBeUsedAsScalarCast) {
11794+ ScalarOpsFromCasts.insert(Inst->getOperand(0));
11795+ // Update the users of the operands of the cast operand to avoid
11796+ // compiler crash.
11797+ if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
11798+ for_each(IOp->operands(), [&](Value *V) {
11799+ auto It = ValueToExtUses->find(V);
11800+ if (It != ValueToExtUses->end()) {
11801+ // Replace all uses to avoid compiler crash.
11802+ ExternalUses[It->second].User = nullptr;
11803+ }
11804+ });
11805+ }
11806+ }
1177711807 }
1177811808 }
1177911809 }
1178011810
1178111811 ExtractCost += ExtraCost;
1178211812 }
11813+ // Insert externals for extract of operands of casts to be emitted as scalars
11814+ // instead of extractelement.
11815+ for (Value *V : ScalarOpsFromCasts) {
11816+ ExternalUsesAsOriginalScalar.insert(V);
11817+ if (const TreeEntry *E = getTreeEntry(V)) {
11818+ ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
11819+ }
11820+ }
1178311821 // Add reduced value cost, if resized.
1178411822 if (!VectorizedVals.empty()) {
1178511823 const TreeEntry &Root = *VectorizableTree.front();
@@ -13095,7 +13133,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1309513133 UniqueBases.insert(VecBase);
1309613134 // If the only one use is vectorized - can delete the extractelement
1309713135 // itself.
13098- if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
13136+ if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
13137+ (NumParts != 1 && count(E->Scalars, EI) > 1) ||
1309913138 any_of(EI->users(), [&](User *U) {
1310013139 const TreeEntry *UTE = R.getTreeEntry(U);
1310113140 return !UTE || R.MultiNodeScalars.contains(U) ||
0 commit comments