@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
5858 ret <4 x float > %r
5959}
6060
; Source vector (<4 x float>) is wider than the destination (<2 x float>):
; lane 3 of %x is extracted, negated, and inserted into lane 1 of %y.
; The expected output negates the whole source vector, narrows it with a
; shuffle that places source lane 3 in result lane 1, and then blends that
; lane with lane 0 of %y using a second shuffle.
61+ define <2 x float > @ext2_v4f32v2f32 (<4 x float > %x , <2 x float > %y ) {
62+ ; CHECK-LABEL: @ext2_v4f32v2f32(
63+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
64+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
65+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
66+ ; CHECK-NEXT: ret <2 x float> [[R]]
67+ ;
68+ %e = extractelement <4 x float > %x , i32 3
69+ %n = fneg float %e
70+ %r = insertelement <2 x float > %y , float %n , i32 1
71+ ret <2 x float > %r
72+ }
73+
6174; Eliminating extract/insert is still profitable. Flags propagate.
6275
6376define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -85,6 +98,19 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
8598 ret <4 x double > %r
8699}
87100
; Source vector (<4 x double>) is wider than the destination (<2 x double>):
; lane 3 of %x is extracted, negated with the nsz flag, and inserted into
; lane 1 of %y. The expected output performs the fneg on the whole source
; vector, keeps the nsz flag on it, and uses two shuffles to narrow the
; result and blend it with lane 0 of %y.
101+ define <2 x double > @ext1_v4f64v2f64 (<4 x double > %x , <2 x double > %y ) {
102+ ; CHECK-LABEL: @ext1_v4f64v2f64(
103+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
104+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
105+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
106+ ; CHECK-NEXT: ret <2 x double> [[R]]
107+ ;
108+ %e = extractelement <4 x double > %x , i32 3
109+ %n = fneg nsz double %e
110+ %r = insertelement <2 x double > %y , double %n , i32 1
111+ ret <2 x double > %r
112+ }
113+
88114define <8 x float > @ext7_v8f32 (<8 x float > %x , <8 x float > %y ) {
89115; CHECK-LABEL: @ext7_v8f32(
90116; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -110,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
110136 ret <8 x float > %r
111137}
112138
; Source vector (<8 x float>) is wider than the destination (<4 x float>):
; lane 7 of %x is extracted, negated, and inserted into lane 3 of %y.
; The expected output negates the whole source vector, narrows it so that
; source lane 7 lands in result lane 3, and blends that lane with lanes 0-2
; of %y using a second shuffle.
139+ define <4 x float > @ext7_v8f32v4f32 (<8 x float > %x , <4 x float > %y ) {
140+ ; CHECK-LABEL: @ext7_v8f32v4f32(
141+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
142+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
143+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
144+ ; CHECK-NEXT: ret <4 x float> [[R]]
145+ ;
146+ %e = extractelement <8 x float > %x , i32 7
147+ %n = fneg float %e
148+ %r = insertelement <4 x float > %y , float %n , i32 3
149+ ret <4 x float > %r
150+ }
151+
113152; Same as above with an extra use of the extracted element.
114153
115154define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -157,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
157196 ret <8 x float > %r
158197}
159198
; Wider-source variant (<8 x float> into <4 x float>) where the extracted
; element has an additional use: it is also passed to @use.
; Under the SSE prefix the expected output keeps the extract for the call
; and still performs the fneg on the whole source vector followed by two
; shuffles. Under the AVX prefix the expected output is the unchanged
; scalar extract/fneg/insert sequence.
; NOTE(review): the difference between the two prefixes presumably comes
; from target-specific cost modeling; the RUN lines are not visible in this
; chunk, so confirm against the file header.
199+ define <4 x float > @ext7_v8f32v4f32_use1 (<8 x float > %x , <4 x float > %y ) {
200+ ; SSE-LABEL: @ext7_v8f32v4f32_use1(
201+ ; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
202+ ; SSE-NEXT: call void @use(float [[E]])
203+ ; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
204+ ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
205+ ; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
206+ ; SSE-NEXT: ret <4 x float> [[R]]
207+ ;
208+ ; AVX-LABEL: @ext7_v8f32v4f32_use1(
209+ ; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
210+ ; AVX-NEXT: call void @use(float [[E]])
211+ ; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
212+ ; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
213+ ; AVX-NEXT: ret <4 x float> [[R]]
214+ ;
215+ %e = extractelement <8 x float > %x , i32 7
216+ call void @use (float %e )
217+ %n = fneg float %e
218+ %r = insertelement <4 x float > %y , float %n , i32 3
219+ ret <4 x float > %r
220+ }
221+
160222; Negative test - the transform is likely not profitable if the fneg has another use.
161223
162224define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -189,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
189251 ret <8 x float > %r
190252}
191253
; Wider-source variant (<8 x float> into <4 x float>) where the scalar fneg
; result has an additional use: it is also passed to @use.
; The expected output is the unchanged scalar extract/fneg/insert sequence,
; i.e. no vector fneg or shuffle is formed in this case.
254+ define <4 x float > @ext7_v8f32v4f32_use2 (<8 x float > %x , <4 x float > %y ) {
255+ ; CHECK-LABEL: @ext7_v8f32v4f32_use2(
256+ ; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
257+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
258+ ; CHECK-NEXT: call void @use(float [[N]])
259+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
260+ ; CHECK-NEXT: ret <4 x float> [[R]]
261+ ;
262+ %e = extractelement <8 x float > %x , i32 7
263+ %n = fneg float %e
264+ call void @use (float %n )
265+ %r = insertelement <4 x float > %y , float %n , i32 3
266+ ret <4 x float > %r
267+ }
268+
192269; Negative test - can't convert variable index to a shuffle.
193270
194271define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -217,9 +294,6 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
217294 ret <4 x double > %r
218295}
219296
220- ; Negative test - require same extract/insert index for simple shuffle.
221- ; TODO: We could handle this by adjusting the cost calculation.
222-
223297define <2 x double > @ext1_v2f64_ins0 (<2 x double > %x , <2 x double > %y ) {
224298; CHECK-LABEL: @ext1_v2f64_ins0(
225299; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
@@ -232,7 +306,6 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
232306 ret <2 x double > %r
233307}
234308
235- ; Negative test - extract from an index greater than the vector width of the destination
236309define <2 x double > @ext3_v4f64v2f64 (<4 x double > %x , <2 x double > %y ) {
237310; CHECK-LABEL: @ext3_v4f64v2f64(
238311; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
0 commit comments