Skip to content

Commit a5e129c

Browse files
committed
[CostModel][X86] getVectorInstrCost - correctly cost v4f32 insertelement into index 0
This is just the MOVSS instruction (SSE41 INSERTPS is still necessary for index != 0) This exposed an issue in VectorCombine::foldInsExtFNeg - we need to use the more general SK_PermuteTwoSrc shuffle kind to allow getShuffleCost to match other shuffle kinds (not just SK_Select).
1 parent 1eed780 commit a5e129c

File tree

3 files changed

+23
-19
lines changed

3 files changed

+23
-19
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4804,9 +4804,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
48044804
MVT MScalarTy = LT.second.getScalarType();
48054805
auto IsCheapPInsrPExtrInsertPS = [&]() {
48064806
// Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4807+
// Inserting f32 into index0 is just movss.
48074808
// Also, assume insertps is relatively cheap on all >= SSE41 targets.
48084809
return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
48094810
(MScalarTy.isInteger() && ST->hasSSE41()) ||
4811+
(MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4812+
Opcode == Instruction::InsertElement) ||
48104813
(MScalarTy == MVT::f32 && ST->hasSSE41() &&
48114814
Opcode == Instruction::InsertElement);
48124815
};

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
705705

706706
InstructionCost NewCost =
707707
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
708-
TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
708+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
709+
CostKind);
709710

710711
bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
711712
// If the lengths of the two vectors are not equal,

llvm/test/Analysis/CostModel/X86/vector-insert-value.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -76,58 +76,58 @@ define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x doub
7676
define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
7777
; SSE2-LABEL: 'insert_float'
7878
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
79-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
79+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
8080
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
8181
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
82-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
82+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
8383
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
8484
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
85-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
85+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
8686
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
87-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
87+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
8888
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
8989
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
90-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
90+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
9191
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
92-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
92+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
9393
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
9494
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
9595
;
9696
; SSE3-LABEL: 'insert_float'
9797
; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
98-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
98+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
9999
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
100100
; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
101-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
101+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
102102
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
103103
; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
104-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
104+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
105105
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
106-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
106+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
107107
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
108108
; SSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
109-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
109+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
110110
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
111-
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
111+
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
112112
; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
113113
; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
114114
;
115115
; SSSE3-LABEL: 'insert_float'
116116
; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
117-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
117+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
118118
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
119119
; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
120-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
120+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
121121
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
122122
; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
123-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
123+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
124124
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
125-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
125+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
126126
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
127127
; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
128-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
128+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
129129
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
130-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
130+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
131131
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
132132
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
133133
;

0 commit comments

Comments
 (0)