@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
1818 ret <4 x float > %r
1919}
2020
; Companion to @ext0_v4f32 with mismatched vector widths: lane 0 is extracted
; from a <2 x float> source, negated as a scalar, and inserted into lane 0 of
; a <4 x float> destination.
; The CHECK lines below match the input body instruction-for-instruction, i.e.
; they expect the extract/fneg/insert sequence to be left untouched.
21+ define <4 x float > @ext0_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
22+ ; CHECK-LABEL: @ext0_v2f32v4f32(
23+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
24+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
25+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
26+ ; CHECK-NEXT: ret <4 x float> [[R]]
27+ ;
28+ %e = extractelement <2 x float > %x , i32 0
29+ %n = fneg float %e
30+ %r = insertelement <4 x float > %y , float %n , i32 0
31+ ret <4 x float > %r
32+ }
33+
2134; Eliminating extract/insert is profitable.
2235
2336define <4 x float > @ext2_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
3245 ret <4 x float > %r
3346}
3447
; Companion to @ext2_v4f32 with mismatched vector widths: the element at
; index 2 of a <2 x float> source is negated and inserted into lane 2 of a
; <4 x float> destination. The CHECK lines expect the body to be unchanged.
; NOTE(review): index 2 is past the end of the 2-element source %x, so per the
; LangRef the extractelement result is poison. Confirm that an out-of-range
; extract is intended here rather than an in-range lane (0 or 1).
48+ define <4 x float > @ext2_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
49+ ; CHECK-LABEL: @ext2_v2f32v4f32(
50+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2
51+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
52+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
53+ ; CHECK-NEXT: ret <4 x float> [[R]]
54+ ;
55+ %e = extractelement <2 x float > %x , i32 2
56+ %n = fneg float %e
57+ %r = insertelement <4 x float > %y , float %n , i32 2
58+ ret <4 x float > %r
59+ }
60+
3561; Eliminating extract/insert is still profitable. Flags propagate.
3662
3763define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -46,6 +72,19 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
4672 ret <2 x double > %r
4773}
4874
; Companion to @ext1_v2f64 with mismatched vector widths: lane 1 of a
; <2 x double> source is negated with the nsz fast-math flag and inserted into
; lane 1 of a <4 x double> destination.
; The CHECK lines expect the scalar sequence unchanged, with nsz still present
; on the fneg.
75+ define <4 x double > @ext1_v2f64v4f64 (<2 x double > %x , <4 x double > %y ) {
76+ ; CHECK-LABEL: @ext1_v2f64v4f64(
77+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
78+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
79+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
80+ ; CHECK-NEXT: ret <4 x double> [[R]]
81+ ;
82+ %e = extractelement <2 x double > %x , i32 1
83+ %n = fneg nsz double %e
84+ %r = insertelement <4 x double > %y , double %n , i32 1
85+ ret <4 x double > %r
86+ }
87+
4988; The vector fneg would cost twice as much as the scalar op with SSE,
5089; so we don't transform there (the shuffle would also be more expensive).
5190
@@ -67,6 +106,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
67106 ret <8 x float > %r
68107}
69108
; Companion to @ext7_v8f32 with mismatched vector widths: the last lane (3) of
; a <4 x float> source is negated and inserted into the last lane (7) of an
; <8 x float> destination. The CHECK lines expect the body to be unchanged.
; NOTE(review): unlike @ext7_v8f32, the extract index (3) and the insert index
; (7) differ here, so this also exercises the different-index case. Confirm
; that is the intent behind keeping the "ext7" name.
109+ define <8 x float > @ext7_v4f32v8f32 (<4 x float > %x , <8 x float > %y ) {
110+ ; CHECK-LABEL: @ext7_v4f32v8f32(
111+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
112+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
113+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
114+ ; CHECK-NEXT: ret <8 x float> [[R]]
115+ ;
116+ %e = extractelement <4 x float > %x , i32 3
117+ %n = fneg float %e
118+ %r = insertelement <8 x float > %y , float %n , i32 7
119+ ret <8 x float > %r
120+ }
121+
70122; Same as above with an extra use of the extracted element.
71123
72124define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -91,6 +143,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
91143 ret <8 x float > %r
92144}
93145
; Companion to @ext7_v8f32_use1 with mismatched vector widths: the extracted
; element %e has a second use (it is passed to @use) in addition to feeding the
; fneg. Lane 3 of a <4 x float> source is negated and inserted into lane 3 of
; an <8 x float> destination. The CHECK lines expect the body to be unchanged,
; including the call between the extract and the fneg.
; NOTE(review): both indices are 3 even though the name says "ext7"; @use is
; presumably declared elsewhere in the file - verify.
146+ define <8 x float > @ext7_v4f32v8f32_use1 (<4 x float > %x , <8 x float > %y ) {
147+ ; CHECK-LABEL: @ext7_v4f32v8f32_use1(
148+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
149+ ; CHECK-NEXT: call void @use(float [[E]])
150+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
151+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
152+ ; CHECK-NEXT: ret <8 x float> [[R]]
153+ ;
154+ %e = extractelement <4 x float > %x , i32 3
155+ call void @use (float %e )
156+ %n = fneg float %e
157+ %r = insertelement <8 x float > %y , float %n , i32 3
158+ ret <8 x float > %r
159+ }
160+
94161; Negative test - the transform is likely not profitable if the fneg has another use.
95162
96163define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -108,6 +175,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
108175 ret <8 x float > %r
109176}
110177
; Companion to @ext7_v8f32_use2 with mismatched vector widths: the negated
; scalar %n has a second use (it is passed to @use) in addition to feeding the
; insertelement. Lane 3 of a <4 x float> source is negated and inserted into
; lane 3 of an <8 x float> destination. The CHECK lines expect the body to be
; unchanged.
; NOTE(review): both indices are 3 even though the name says "ext7"; @use is
; presumably declared elsewhere in the file - verify.
178+ define <8 x float > @ext7_v4f32v8f32_use2 (<4 x float > %x , <8 x float > %y ) {
179+ ; CHECK-LABEL: @ext7_v4f32v8f32_use2(
180+ ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
181+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
182+ ; CHECK-NEXT: call void @use(float [[N]])
183+ ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
184+ ; CHECK-NEXT: ret <8 x float> [[R]]
185+ ;
186+ %e = extractelement <4 x float > %x , i32 3
187+ %n = fneg float %e
188+ call void @use (float %n )
189+ %r = insertelement <8 x float > %y , float %n , i32 3
190+ ret <8 x float > %r
191+ }
192+
111193; Negative test - can't convert variable index to a shuffle.
112194
113195define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -123,6 +205,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
123205 ret <2 x double > %r
124206}
125207
; Companion to @ext_index_var_v2f64 with mismatched vector widths: the same
; runtime value %index selects both the lane extracted from the <2 x double>
; source and the lane written in the <4 x double> destination. The fneg
; carries nsz. The CHECK lines expect the body to be unchanged, with a single
; [[INDEX]] capture reused for both the extract and the insert.
208+ define <4 x double > @ext_index_var_v2f64v4f64 (<2 x double > %x , <4 x double > %y , i32 %index ) {
209+ ; CHECK-LABEL: @ext_index_var_v2f64v4f64(
210+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
211+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
212+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
213+ ; CHECK-NEXT: ret <4 x double> [[R]]
214+ ;
215+ %e = extractelement <2 x double > %x , i32 %index
216+ %n = fneg nsz double %e
217+ %r = insertelement <4 x double > %y , double %n , i32 %index
218+ ret <4 x double > %r
219+ }
220+
126221; Negative test - require same extract/insert index for simple shuffle.
127222; TODO: We could handle this by adjusting the cost calculation.
128223
@@ -139,6 +234,19 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
139234 ret <2 x double > %r
140235}
141236
; Companion to @ext1_v2f64_ins0 with mismatched vector widths: the extract and
; insert lanes differ. Lane 1 of a <2 x double> source is negated (with nsz)
; and inserted into lane 0 of a <4 x double> destination. The CHECK lines
; expect the body to be unchanged.
237+ define <4 x double > @ext1_v2f64v4f64_ins0 (<2 x double > %x , <4 x double > %y ) {
238+ ; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
239+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
240+ ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
241+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
242+ ; CHECK-NEXT: ret <4 x double> [[R]]
243+ ;
244+ %e = extractelement <2 x double > %x , i32 1
245+ %n = fneg nsz double %e
246+ %r = insertelement <4 x double > %y , double %n , i32 0
247+ ret <4 x double > %r
248+ }
249+
142250; Negative test - avoid changing poison ops
143251
144252define <4 x float > @ext12_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -154,6 +262,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
154262 ret <4 x float > %r
155263}
156264
; Companion to @ext12_v4f32 with mismatched vector widths: both constant
; indices are out of range for their vectors. Index 6 is applied to a
; 2-element source and index 12 to a 4-element destination, so per the LangRef
; both the extract and the insert produce poison. The two indices also differ
; from each other. The CHECK lines expect the body to be unchanged.
265+ define <4 x float > @ext12_v2f32v4f32 (<2 x float > %x , <4 x float > %y ) {
266+ ; CHECK-LABEL: @ext12_v2f32v4f32(
267+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
268+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
269+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
270+ ; CHECK-NEXT: ret <4 x float> [[R]]
271+ ;
272+ %e = extractelement <2 x float > %x , i32 6
273+ %n = fneg float %e
274+ %r = insertelement <4 x float > %y , float %n , i32 12
275+ ret <4 x float > %r
276+ }
277+
157278; This used to crash because we assumed matching a true, unary fneg instruction.
158279
159280define <2 x float > @ext1_v2f32_fsub (<2 x float > %x ) {
@@ -181,3 +302,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
181302 %r = insertelement <2 x float > %y , float %s , i32 1
182303 ret <2 x float > %r
183304}
305+
; Companion to @ext1_v2f32_fsub_fmf with mismatched vector widths: the
; negation is written as a binary "fsub 0.0, %e" carrying the nnan and nsz
; fast-math flags rather than as a unary fneg. Lane 1 of a <2 x float> source
; is negated and inserted into lane 1 of a <4 x float> destination.
; The CHECK lines expect the body to be unchanged; the flags appear there as
; "nnan nsz" and the zero operand as 0.000000e+00.
306+ define <4 x float > @ext1_v2f32v4f32_fsub_fmf (<2 x float > %x , <4 x float > %y ) {
307+ ; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
308+ ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
309+ ; CHECK-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]]
310+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1
311+ ; CHECK-NEXT: ret <4 x float> [[R]]
312+ ;
313+ %e = extractelement <2 x float > %x , i32 1
314+ %s = fsub nsz nnan float 0.0 , %e
315+ %r = insertelement <4 x float > %y , float %s , i32 1
316+ ret <4 x float > %r
317+ }
0 commit comments