@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
1818 ret <4 x float > %r
1919}
2020
; Length-changing variant: lane 0 of a <2 x float> source is negated and
; written to lane 0 of a <4 x float> destination. The assertions expect the
; scalar extract/fneg/insert sequence to be left untouched.

define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext0_v2f32v4f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 0
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 0
  ret <4 x float> %r
}
33+
; Eliminating extract/insert is profitable.
2235
2336define <4 x float > @ext2_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
3245 ret <4 x float > %r
3346}
3447
; Length-changing variant: the source is <2 x float>, the destination is
; <4 x float>, and both the extract and the insert use index 2.
; NOTE: index 2 is out of range for a two-lane vector, so the extracted
; value is poison per the LangRef.
; The widening shuffle must read the negated vector ([[TMP1]]); reading [[X]]
; would leave the fneg dead and insert an un-negated lane.

define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext2_v2f32v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 2
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 2
  ret <4 x float> %r
}
60+
; Eliminating extract/insert is still profitable. Flags propagate.
3662
3763define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -46,6 +72,25 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
4672 ret <2 x double > %r
4773}
4874
; Length-changing variant: lane 1 of a <2 x double> source is negated (nsz)
; and written to lane 1 of a <4 x double> destination.
;   SSE prefix: the scalar extract/fneg/insert sequence is expected to remain.
;   AVX prefix: a vector fneg carrying nsz, a widening shuffle of that negated
;               vector, and a blend into %y are expected.
; The widening shuffle must read [[TMP1]]; reading [[X]] would leave the fneg
; dead and insert an un-negated lane.

define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
; SSE-LABEL: @ext1_v2f64v4f64(
; SSE-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; SSE-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
; SSE-NEXT:    ret <4 x double> [[R]]
;
; AVX-LABEL: @ext1_v2f64v4f64(
; AVX-NEXT:    [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; AVX-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 1
  ret <4 x double> %r
}
93+
; The vector fneg would cost twice as much as the scalar op with SSE,
; so we don't transform there (the shuffle would also be more expensive).
5196
@@ -67,6 +112,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
67112 ret <8 x float > %r
68113}
69114
; Length-changing variant with mismatched lanes: lane 3 of a <4 x float>
; source is negated and written to lane 7 of an <8 x float> destination.
; The assertions expect the scalar sequence to be left untouched.

define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 7
  ret <8 x float> %r
}
127+
; Same as above with an extra use of the extracted element.
71129
72130define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -91,6 +149,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
91149 ret <8 x float > %r
92150}
93151
; Length-changing variant where the extracted element has a second user
; (the call to @use). Both the extract and the insert address lane 3.
; The assertions expect the scalar sequence to be left untouched.

define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use1(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    call void @use(float [[E]])
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  call void @use(float %e)
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 3
  ret <8 x float> %r
}
166+
; Negative test - the transform is likely not profitable if the fneg has another use.
95168
96169define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -108,6 +181,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
108181 ret <8 x float > %r
109182}
110183
; Length-changing variant where the scalar fneg result has a second user
; (the call to @use). Both the extract and the insert address lane 3.
; The assertions expect the scalar sequence to be left untouched.

define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use2(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    call void @use(float [[N]])
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  %n = fneg float %e
  call void @use(float %n)
  %r = insertelement <8 x float> %y, float %n, i32 3
  ret <8 x float> %r
}
198+
; Negative test - can't convert variable index to a shuffle.
112200
113201define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -123,6 +211,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
123211 ret <2 x double > %r
124212}
125213
; Length-changing variant with a run-time lane index: %index selects both the
; lane read from the <2 x double> source and the lane written in the
; <4 x double> destination. The assertions expect the scalar sequence to be
; left untouched.

define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) {
; CHECK-LABEL: @ext_index_var_v2f64v4f64(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
; CHECK-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 %index
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 %index
  ret <4 x double> %r
}
226+
; Negative test - require same extract/insert index for simple shuffle.
; TODO: We could handle this by adjusting the cost calculation.
128229
@@ -139,6 +240,33 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
139240 ret <2 x double > %r
140241}
141242
; Negative test - the extract index (3) is not a valid lane of the narrower
; <2 x double> destination, and it differs from the insert index (1).
; The assertions expect the scalar sequence to be left untouched.

define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext3_v4f64v2f64(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %e = extractelement <4 x double> %x, i32 3
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r
}
256+
; Length-changing variant with mismatched lanes: lane 1 of a <2 x double>
; source is negated (nsz) and written to lane 0 of a <4 x double> destination.
; The assertions expect the scalar sequence to be left untouched.

define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 0
  ret <4 x double> %r
}
269+
; Negative test - avoid changing poison ops
143271
144272define <4 x float > @ext12_v4f32 (<4 x float > %x , <4 x float > %y ) {
@@ -154,6 +282,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
154282 ret <4 x float > %r
155283}
156284
; Length-changing variant with out-of-range constant indices: index 6 exceeds
; the two lanes of the <2 x float> source and index 12 exceeds the four lanes
; of the <4 x float> destination. The assertions expect the scalar sequence
; to be left untouched.

define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext12_v2f32v4f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 6
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 12
  ret <4 x float> %r
}
297+
; This used to crash because we assumed matching a true, unary fneg instruction.
158299
159300define <2 x float > @ext1_v2f32_fsub (<2 x float > %x ) {
@@ -181,3 +322,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
181322 %r = insertelement <2 x float > %y , float %s , i32 1
182323 ret <2 x float > %r
183324}
325+
; Length-changing variant where the negation is spelled as
; 'fsub nsz nnan float 0.0, %e' rather than a unary fneg. Lane 1 of the
; <2 x float> source feeds lane 1 of the <4 x float> destination.
; The assertions expect a vector fneg that keeps the nnan/nsz flags, a
; widening shuffle of that negated vector, and a blend into %y.
; The widening shuffle must read [[TMP1]]; reading [[X]] would leave the fneg
; dead and insert an un-negated lane.

define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 1
  %s = fsub nsz nnan float 0.0, %e
  %r = insertelement <4 x float> %y, float %s, i32 1
  ret <4 x float> %r
}
0 commit comments