@@ -3621,6 +3621,27 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
  return Cost;
}

+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+  if (SubMask.empty())
+    return;
+  if (Mask.empty()) {
+    Mask.append(SubMask.begin(), SubMask.end());
+    return;
+  }
+  SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
+  int TermValue = std::min(Mask.size(), SubMask.size());
+  for (int I = 0, E = SubMask.size(); I < E; ++I) {
+    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+        Mask[SubMask[I]] >= TermValue) {
+      NewMask[I] = UndefMaskElem;
+      continue;
+    }
+    NewMask[I] = Mask[SubMask[I]];
+  }
+  Mask.swap(NewMask);
+}
+
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                      ArrayRef<Value *> VectorizedVals) {
  ArrayRef<Value *> VL = E->Scalars;
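To make the semantics of the new static `addMask` concrete, here is a minimal standalone sketch (not the LLVM code itself), using `std::vector<int>` and `-1` in place of `SmallVectorImpl<int>` and `UndefMaskElem`: lane `I` of the composed mask reads `Mask[SubMask[I]]`, and any out-of-range or undef lane collapses to undef.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Compose Mask with SubMask the same way the static addMask above does:
// the result selects Mask[SubMask[I]] for each lane I.
void composeMask(std::vector<int> &Mask, const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask = SubMask;
    return;
  }
  std::vector<int> NewMask(SubMask.size());
  int TermValue = static_cast<int>(std::min(Mask.size(), SubMask.size()));
  for (int I = 0, E = static_cast<int>(SubMask.size()); I < E; ++I) {
    if (SubMask[I] >= TermValue || SubMask[I] == kUndef ||
        Mask[SubMask[I]] >= TermValue) {
      NewMask[I] = kUndef; // unreachable lanes become undef
      continue;
    }
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

int main() {
  // Reorder mask {2, 0, 1, 3} composed with reuse indices {0, 0, 1, 2}:
  // lane I reads Mask[SubMask[I]], so the result is {2, 2, 0, 1}.
  std::vector<int> Mask = {2, 0, 1, 3};
  composeMask(Mask, {0, 0, 1, 2});
  assert((Mask == std::vector<int>{2, 2, 0, 1}));
}
```

The cost code below relies on exactly this composition when it folds reorder and reuse indices into a single permutation.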
@@ -3633,6 +3654,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *FinalVecTy = VecTy;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
@@ -3643,12 +3665,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,

  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  InstructionCost ReuseShuffleCost = 0;
-  if (NeedToShuffleReuses) {
-    ReuseShuffleCost =
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                            E->ReuseShuffleIndices);
-  }
+  if (NeedToShuffleReuses)
+    FinalVecTy =
+        FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
  // FIXME: it tries to fix a problem with MSVC buildbots.
  TargetTransformInfo &TTIRef = *TTI;
  auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
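A note on the `FinalVecTy` introduced above: when a bundle has reuse indices, the vector that shuffle costs are charged against has `ReuseShuffleNumbers` lanes rather than `VL.size()`. A tiny illustration with made-up sizes, assuming a standalone program compiled and linked against the LLVM libraries:

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Assumed example: three unique scalars reused into a four-lane result,
  // i.e. ReuseShuffleIndices = {0, 1, 2, 2}.
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 3); // <3 x i32>
  unsigned ReuseShuffleNumbers = 4;
  auto *FinalVecTy =
      FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
  return FinalVecTy->getNumElements() == 4 ? 0 : 1;             // <4 x i32>
}
```

Permute costs queried on `FinalVecTy` therefore reflect the width the bundle's users actually see.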
@@ -3737,23 +3756,26 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            dbgs()
            << "SLP: perfect diamond match for gather bundle that starts with "
            << *VL.front() << ".\n");
+        if (NeedToShuffleReuses)
+          GatherCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  FinalVecTy, E->ReuseShuffleIndices);
      } else {
        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                          << " entries for bundle that starts with "
                          << *VL.front() << ".\n");
        // Detected that instead of gather we can emit a shuffle of single/two
        // previously vectorized nodes. Add the cost of the permutation rather
        // than gather.
-        GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
+        ::addMask(Mask, E->ReuseShuffleIndices);
+        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
      }
-      return ReuseShuffleCost + GatherCost;
+      return GatherCost;
    }
    if (isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as the
      // broadcast.
-      return ReuseShuffleCost +
-             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
-                                 0);
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
    }
    if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
        allSameBlock(VL) &&
@@ -3771,11 +3793,36 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost Cost =
            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
        AdjustExtractsCost(Cost, /*IsGather=*/true);
-        return ReuseShuffleCost + Cost;
+        if (NeedToShuffleReuses)
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                      FinalVecTy, E->ReuseShuffleIndices);
+        return Cost;
      }
    }
+    InstructionCost ReuseShuffleCost = 0;
+    if (NeedToShuffleReuses)
+      ReuseShuffleCost = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
    return ReuseShuffleCost + getGatherCost(VL);
  }
+  InstructionCost CommonCost = 0;
+  SmallVector<int> Mask;
+  if (!E->ReorderIndices.empty()) {
+    SmallVector<int> NewMask;
+    if (E->getOpcode() == Instruction::Store) {
+      // For stores the order is actually a mask.
+      NewMask.resize(E->ReorderIndices.size());
+      copy(E->ReorderIndices, NewMask.begin());
+    } else {
+      inversePermutation(E->ReorderIndices, NewMask);
+    }
+    ::addMask(Mask, NewMask);
+  }
+  if (NeedToShuffleReuses)
+    ::addMask(Mask, E->ReuseShuffleIndices);
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+    CommonCost =
+        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize) &&
         "Unhandled state");
@@ -3797,34 +3844,28 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        for (unsigned I : E->ReuseShuffleIndices) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *EE = cast<ExtractElementInst>(VL[I]);
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
          } else {
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
            ++Idx;
          }
        }
        Idx = ReuseShuffleNumbers;
        for (Value *V : VL) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *EE = cast<ExtractElementInst>(V);
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
          } else {
            --Idx;
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
          }
        }
-        CommonCost = ReuseShuffleCost;
-      } else if (!E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        CommonCost = TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
      }
      if (ShuffleOrOp == Instruction::ExtractValue) {
        for (unsigned I = 0, E = VL.size(); I < E; ++I) {
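The per-case reorder shuffle removed above is now subsumed by the single mask built before the switch: the reorder indices (inverted, except for stores) and the reuse indices are folded into one permutation, and when they cancel no shuffle cost is charged at all. A small self-contained sketch of that cancellation, with assumed helpers standing in for `inversePermutation` and the composition rule of `::addMask`:

```cpp
#include <cassert>
#include <vector>

// Inv[Order[I]] = I, mirroring the SLP vectorizer's inversePermutation.
std::vector<int> invert(const std::vector<int> &Order) {
  std::vector<int> Inv(Order.size());
  for (int I = 0, E = static_cast<int>(Order.size()); I < E; ++I)
    Inv[Order[I]] = I;
  return Inv;
}

bool isIdentity(const std::vector<int> &Mask) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I < E; ++I)
    if (Mask[I] != I)
      return false;
  return true;
}

int main() {
  // ReorderIndices = {1, 0, 3, 2} on a non-store node (so it gets inverted),
  // combined with ReuseShuffleIndices = {1, 0, 3, 2}.
  std::vector<int> Mask = invert({1, 0, 3, 2}); // {1, 0, 3, 2}
  std::vector<int> Reuse = {1, 0, 3, 2};
  std::vector<int> Composed(Reuse.size());
  for (int I = 0, E = static_cast<int>(Reuse.size()); I < E; ++I)
    Composed[I] = Mask[Reuse[I]]; // same rule as ::addMask
  // The two permutations cancel, so CommonCost stays zero.
  assert(isIdentity(Composed));
}
```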
@@ -3915,7 +3956,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
                                TTI::getCastContextHint(VL0), CostKind, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }

      // Calculate the cost of this instruction.
@@ -3925,12 +3966,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      InstructionCost VecCost = 0;
      // Check if the values are candidates to demote.
      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost =
-            ReuseShuffleCost +
-            TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
-                                  TTI::getCastContextHint(VL0), CostKind, VL0);
+        VecCost = CommonCost + TTI->getCastInstrCost(
+                                   E->getOpcode(), VecTy, SrcVecTy,
+                                   TTI::getCastContextHint(VL0), CostKind, VL0);
      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
@@ -3941,7 +3981,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3982,8 +4022,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
        VecCost = std::min(VecCost, IntrinsicCost);
      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
    }
    case Instruction::FNeg:
    case Instruction::Add:
@@ -4046,14 +4086,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
                                      Op2VK, Op1VP, Op2VP, Operands, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      InstructionCost VecCost =
          TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                      Op2VK, Op1VP, Op2VP, Operands, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
    }
    case Instruction::GetElementPtr: {
      TargetTransformInfo::OperandValueKind Op1VK =
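Every vectorizable case now returns `CommonCost + VecCost - ScalarCost`, with `CommonCost` carrying the single reorder/reuse permute and the per-element credit applied when reuse indices are present. A toy calculation with assumed numbers, just to show how the pieces combine (real values come from `TargetTransformInfo`):

```cpp
#include <cstdio>

int main() {
  int ScalarEltCost = 1;       // assumed cost of one scalar add
  int NumElements = 4;         // VL.size()
  int ReuseShuffleNumbers = 6; // E->ReuseShuffleIndices.size()
  int PermuteCost = 1;         // assumed SK_PermuteSingleSrc cost on FinalVecTy
  int VecCost = 1;             // assumed cost of one vector add

  int CommonCost = PermuteCost; // cost of the composed shuffle mask
  // Mirror the adjustment in the diff: CommonCost -=
  // (ReuseShuffleNumbers - VL.size()) * ScalarEltCost when reuses are present.
  CommonCost -= (ReuseShuffleNumbers - NumElements) * ScalarEltCost;
  int ScalarCost = NumElements * ScalarEltCost;
  std::printf("entry cost = %d\n", CommonCost + VecCost - ScalarCost); // -4
  return 0;
}
```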
@@ -4064,21 +4104,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
          Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      InstructionCost VecCost = TTI->getArithmeticInstrCost(
          Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      Align Alignment = cast<LoadInst>(VL0)->getAlign();
      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
          Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
      InstructionCost VecLdCost;
@@ -4095,14 +4135,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
            /*VariableMask=*/false, Alignment, CostKind, VL0);
      }
-      if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecLdCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
-      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+      return CommonCost + VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
@@ -4115,14 +4149,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
      InstructionCost VecStCost = TTI->getMemoryOpCost(
          Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
-      return VecStCost - ScalarStCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+      return CommonCost + VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
@@ -4133,7 +4161,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      InstructionCost ScalarEltCost =
          TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;

@@ -4145,7 +4173,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                        << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                        << " for " << *CI << "\n");

-      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+      return CommonCost + VecCallCost - ScalarCallCost;
    }
    case Instruction::ShuffleVector: {
      assert(E->isAltShuffle() &&
@@ -4158,11 +4186,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      if (NeedToShuffleReuses) {
        for (unsigned Idx : E->ReuseShuffleIndices) {
          Instruction *I = cast<Instruction>(VL[Idx]);
-          ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+          CommonCost -= TTI->getInstructionCost(I, CostKind);
        }
        for (Value *V : VL) {
          Instruction *I = cast<Instruction>(V);
-          ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+          CommonCost += TTI->getInstructionCost(I, CostKind);
        }
      }
      for (Value *V : VL) {
@@ -4196,8 +4224,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
      }
      VecCost +=
          TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
    }
    default:
      llvm_unreachable("Unknown instruction");
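For alternate-opcode bundles the vector cost adds an `SK_Select` blend of the two opcode results. A sketch of how that select mask is typically formed for an add/sub alternation (assumed bundle; the real mask is built just above this return from the main/alternate opcode of each scalar):

```cpp
#include <cassert>
#include <vector>

int main() {
  // Assumed bundle: add, sub, add, sub (main opcode = add, alternate = sub).
  std::vector<bool> IsAltOp = {false, true, false, true};
  const int NumLanes = static_cast<int>(IsAltOp.size());
  std::vector<int> Mask(NumLanes);
  for (int I = 0; I < NumLanes; ++I)
    Mask[I] = IsAltOp[I] ? I + NumLanes : I; // alt lanes pick from the 2nd vector
  // The blend mask {0, 5, 2, 7} is what the SK_Select shuffle is priced on.
  assert((Mask == std::vector<int>{0, 5, 2, 7}));
}
```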
@@ -4929,25 +4957,7 @@ class ShuffleInstructionBuilder {
    addMask(NewMask);
  }

-  void addMask(ArrayRef<int> SubMask) {
-    if (SubMask.empty())
-      return;
-    if (Mask.empty()) {
-      Mask.append(SubMask.begin(), SubMask.end());
-      return;
-    }
-    SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
-    int TermValue = std::min(Mask.size(), SubMask.size());
-    for (int I = 0, E = SubMask.size(); I < E; ++I) {
-      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
-          Mask[SubMask[I]] >= TermValue) {
-        NewMask[I] = UndefMaskElem;
-        continue;
-      }
-      NewMask[I] = Mask[SubMask[I]];
-    }
-    Mask.swap(NewMask);
-  }
+  void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }

  Value *finalize(Value *V) {
    IsFinalized = true;