@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
5858 ret <4 x float > %r
5959}
6060
; Narrowing case: lane 3 of the <4 x float> source is negated and inserted
; into lane 1 of the <2 x float> destination (extract and insert indices differ).
; Expected output: a full-width vector fneg of %x, a length-changing shuffle that
; moves source lane 3 into lane 1 of a <2 x float>, then a blend with %y.
; NOTE(review): the "2" in the name matches neither the extract lane (3) nor the
; insert lane (1) used here - confirm the intended naming.
61+ define <2 x float > @ext2_v4f32v2f32 (<4 x float > %x , <2 x float > %y ) {
62+ ; CHECK-LABEL: @ext2_v4f32v2f32(
63+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
64+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
65+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
66+ ; CHECK-NEXT: ret <2 x float> [[R]]
67+ ;
68+ %e = extractelement <4 x float > %x , i32 3
69+ %n = fneg float %e
70+ %r = insertelement <2 x float > %y , float %n , i32 1
71+ ret <2 x float > %r
72+ }
73+
6174; Eliminating extract/insert is still profitable. Flags propagate.
6275
6376define <2 x double > @ext1_v2f64 (<2 x double > %x , <2 x double > %y ) {
@@ -73,24 +86,31 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
7386}
7487
; Widening case: lane 1 of the <2 x double> source is negated (nsz) and inserted
; into lane 1 of the <4 x double> destination.
; In this diff the separate SSE expectation (scalar extract/fneg/insert kept) and
; AVX expectation (vector fneg + shuffles) are replaced by one common CHECK
; expectation: vector fneg nsz of %x, a widening shuffle, then a blend with %y
; that takes only lane 1 from the negated vector (mask index 5).
7588define <4 x double > @ext1_v2f64v4f64 (<2 x double > %x , <4 x double > %y ) {
76- ; SSE-LABEL: @ext1_v2f64v4f64(
77- ; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
78- ; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
79- ; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
80- ; SSE-NEXT: ret <4 x double> [[R]]
81- ;
82- ; AVX-LABEL: @ext1_v2f64v4f64(
83- ; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
84- ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
85- ; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
86- ; AVX-NEXT: ret <4 x double> [[R]]
89+ ; CHECK-LABEL: @ext1_v2f64v4f64(
90+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
91+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
92+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
93+ ; CHECK-NEXT: ret <4 x double> [[R]]
8794;
8895 %e = extractelement <2 x double > %x , i32 1
8996 %n = fneg nsz double %e
9097 %r = insertelement <4 x double > %y , double %n , i32 1
9198 ret <4 x double > %r
9299}
93100
; Narrowing case with fast-math flags: lane 3 of the <4 x double> source is
; negated (nsz) and inserted into lane 1 of the <2 x double> destination.
; Expected output: vector fneg that keeps nsz, a narrowing shuffle that moves
; source lane 3 into lane 1, then a blend with %y.
; NOTE(review): the types, indices and CHECK lines look identical to
; @ext3_v4f64v2f64 elsewhere in this file - confirm whether both are needed.
101+ define <2 x double > @ext1_v4f64v2f64 (<4 x double > %x , <2 x double > %y ) {
102+ ; CHECK-LABEL: @ext1_v4f64v2f64(
103+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
104+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
105+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
106+ ; CHECK-NEXT: ret <2 x double> [[R]]
107+ ;
108+ %e = extractelement <4 x double > %x , i32 3
109+ %n = fneg nsz double %e
110+ %r = insertelement <2 x double > %y , double %n , i32 1
111+ ret <2 x double > %r
112+ }
113+
94114define <8 x float > @ext7_v8f32 (<8 x float > %x , <8 x float > %y ) {
95115; CHECK-LABEL: @ext7_v8f32(
96116; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -105,9 +125,9 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
105125
106126define <8 x float > @ext7_v4f32v8f32 (<4 x float > %x , <8 x float > %y ) {
107127; CHECK-LABEL: @ext7_v4f32v8f32(
108- ; CHECK-NEXT: [[E :%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
109- ; CHECK-NEXT: [[N :%.*]] = fneg float [[E]]
110- ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N ]], i32 7
128+ ; CHECK-NEXT: [[TMP1 :%.*]] = fneg <4 x float> [[X:%.*]]
129+ ; CHECK-NEXT: [[TMP2 :%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
130+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2 ]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
111131; CHECK-NEXT: ret <8 x float> [[R]]
112132;
113133 %e = extractelement <4 x float > %x , i32 3
@@ -116,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
116136 ret <8 x float > %r
117137}
118138
; Narrowing case: lane 7 of the <8 x float> source is negated and inserted
; into lane 3 of the <4 x float> destination.
; Expected output: vector fneg of %x, a narrowing shuffle that moves source
; lane 7 into lane 3, then a blend that keeps lanes 0-2 of %y.
139+ define <4 x float > @ext7_v8f32v4f32 (<8 x float > %x , <4 x float > %y ) {
140+ ; CHECK-LABEL: @ext7_v8f32v4f32(
141+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
142+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
143+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
144+ ; CHECK-NEXT: ret <4 x float> [[R]]
145+ ;
146+ %e = extractelement <8 x float > %x , i32 7
147+ %n = fneg float %e
148+ %r = insertelement <4 x float > %y , float %n , i32 3
149+ ret <4 x float > %r
150+ }
151+
119152; Same as above with an extra use of the extracted element.
120153
121154define <8 x float > @ext7_v8f32_use1 (<8 x float > %x , <8 x float > %y ) {
@@ -141,12 +174,20 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
141174}
142175
143176define <8 x float > @ext7_v4f32v8f32_use1 (<4 x float > %x , <8 x float > %y ) {
144- ; CHECK-LABEL: @ext7_v4f32v8f32_use1(
145- ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
146- ; CHECK-NEXT: call void @use(float [[E]])
147- ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
148- ; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
149- ; CHECK-NEXT: ret <8 x float> [[R]]
177+ ; SSE-LABEL: @ext7_v4f32v8f32_use1(
178+ ; SSE-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
179+ ; SSE-NEXT: call void @use(float [[E]])
180+ ; SSE-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X]]
181+ ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
182+ ; SSE-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
183+ ; SSE-NEXT: ret <8 x float> [[R]]
184+ ;
185+ ; AVX-LABEL: @ext7_v4f32v8f32_use1(
186+ ; AVX-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
187+ ; AVX-NEXT: call void @use(float [[E]])
188+ ; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
189+ ; AVX-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
190+ ; AVX-NEXT: ret <8 x float> [[R]]
150191;
151192 %e = extractelement <4 x float > %x , i32 3
152193 call void @use (float %e )
@@ -155,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
155196 ret <8 x float > %r
156197}
157198
; Narrowing case where the extracted element has a second use (the call to @use).
; The expectations differ per check prefix:
;   SSE - the extract is kept for the call, but the fneg/insert pair still becomes
;         a vector fneg plus two shuffles.
;   AVX - the scalar extract/fneg/insert sequence is left unchanged.
199+ define <4 x float > @ext7_v8f32v4f32_use1 (<8 x float > %x , <4 x float > %y ) {
200+ ; SSE-LABEL: @ext7_v8f32v4f32_use1(
201+ ; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
202+ ; SSE-NEXT: call void @use(float [[E]])
203+ ; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
204+ ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
205+ ; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
206+ ; SSE-NEXT: ret <4 x float> [[R]]
207+ ;
208+ ; AVX-LABEL: @ext7_v8f32v4f32_use1(
209+ ; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
210+ ; AVX-NEXT: call void @use(float [[E]])
211+ ; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
212+ ; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
213+ ; AVX-NEXT: ret <4 x float> [[R]]
214+ ;
215+ %e = extractelement <8 x float > %x , i32 7
216+ call void @use (float %e )
217+ %n = fneg float %e
218+ %r = insertelement <4 x float > %y , float %n , i32 3
219+ ret <4 x float > %r
220+ }
221+
158222; Negative test - the transform is likely not profitable if the fneg has another use.
159223
160224define <8 x float > @ext7_v8f32_use2 (<8 x float > %x , <8 x float > %y ) {
@@ -187,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
187251 ret <8 x float > %r
188252}
189253
; Narrowing variant of the negative test: the scalar fneg result %n is also
; passed to @use, so it has a use besides the insertelement.
; The CHECK lines expect the scalar extract/fneg/insert sequence to be unchanged.
254+ define <4 x float > @ext7_v8f32v4f32_use2 (<8 x float > %x , <4 x float > %y ) {
255+ ; CHECK-LABEL: @ext7_v8f32v4f32_use2(
256+ ; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
257+ ; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
258+ ; CHECK-NEXT: call void @use(float [[N]])
259+ ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
260+ ; CHECK-NEXT: ret <4 x float> [[R]]
261+ ;
262+ %e = extractelement <8 x float > %x , i32 7
263+ %n = fneg float %e
264+ call void @use (float %n )
265+ %r = insertelement <4 x float > %y , float %n , i32 3
266+ ret <4 x float > %r
267+ }
268+
190269; Negative test - can't convert variable index to a shuffle.
191270
192271define <2 x double > @ext_index_var_v2f64 (<2 x double > %x , <2 x double > %y , i32 %index ) {
@@ -215,14 +294,10 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
215294 ret <4 x double > %r
216295}
217296
218- ; Negative test - require same extract/insert index for simple shuffle.
219- ; TODO: We could handle this by adjusting the cost calculation.
220-
221297define <2 x double > @ext1_v2f64_ins0 (<2 x double > %x , <2 x double > %y ) {
222298; CHECK-LABEL: @ext1_v2f64_ins0(
223- ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
224- ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
225- ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
299+ ; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
300+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
226301; CHECK-NEXT: ret <2 x double> [[R]]
227302;
228303 %e = extractelement <2 x double > %x , i32 1
@@ -231,12 +306,11 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
231306 ret <2 x double > %r
232307}
233308
234- ; Negative test - extract from an index greater than the vector width of the destination
235309define <2 x double > @ext3_v4f64v2f64 (<4 x double > %x , <2 x double > %y ) {
236310; CHECK-LABEL: @ext3_v4f64v2f64(
237- ; CHECK-NEXT: [[E :%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
238- ; CHECK-NEXT: [[N :%.*]] = fneg nsz double [[E]]
239- ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N ]], i32 1
311+ ; CHECK-NEXT: [[TMP1 :%.*]] = fneg nsz <4 x double> [[X:%.*]]
312+ ; CHECK-NEXT: [[TMP2 :%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
313+ ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2 ]], <2 x i32> <i32 0, i32 3>
240314; CHECK-NEXT: ret <2 x double> [[R]]
241315;
242316 %e = extractelement <4 x double > %x , i32 3
@@ -246,11 +320,17 @@ define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
246320}
247321
248322define <4 x double > @ext1_v2f64v4f64_ins0 (<2 x double > %x , <4 x double > %y ) {
249- ; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
250- ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
251- ; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
252- ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
253- ; CHECK-NEXT: ret <4 x double> [[R]]
323+ ; SSE-LABEL: @ext1_v2f64v4f64_ins0(
324+ ; SSE-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
325+ ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
326+ ; SSE-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
327+ ; SSE-NEXT: ret <4 x double> [[R]]
328+ ;
329+ ; AVX-LABEL: @ext1_v2f64v4f64_ins0(
330+ ; AVX-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
331+ ; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]]
332+ ; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
333+ ; AVX-NEXT: ret <4 x double> [[R]]
254334;
255335 %e = extractelement <2 x double > %x , i32 1
256336 %n = fneg nsz double %e
0 commit comments