@@ -56,46 +56,28 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: shl_i512_1:
 ; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ZNVER4-NEXT: vmovq %xmm0, %rdx
-; ZNVER4-NEXT: vpextrq $1, %xmm0, %r9
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %rax
-; ZNVER4-NEXT: vmovq %xmm1, %rcx
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT: shrq $63, %rdx
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %rsi
-; ZNVER4-NEXT: vmovq %xmm1, %rdi
 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: leaq (%rdx,%r9,2), %rdx
-; ZNVER4-NEXT: shrq $63, %r9
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; ZNVER4-NEXT: vpshldq $1, %xmm0, %xmm8, %xmm9
 ; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm0
-; ZNVER4-NEXT: vmovq %xmm1, %r10
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %r8
-; ZNVER4-NEXT: leaq (%r9,%r10,2), %r9
-; ZNVER4-NEXT: shrq $63, %r10
-; ZNVER4-NEXT: vmovq %rdx, %xmm4
-; ZNVER4-NEXT: leaq (%r10,%r8,2), %r10
-; ZNVER4-NEXT: shrq $63, %r8
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; ZNVER4-NEXT: leaq (%r8,%rdi,2), %r8
-; ZNVER4-NEXT: shrq $63, %rdi
-; ZNVER4-NEXT: leaq (%rdi,%rsi,2), %rdi
-; ZNVER4-NEXT: shrq $63, %rsi
-; ZNVER4-NEXT: leaq (%rsi,%rcx,2), %rsi
-; ZNVER4-NEXT: shrq $63, %rcx
-; ZNVER4-NEXT: vmovq %r8, %xmm3
-; ZNVER4-NEXT: leaq (%rcx,%rax,2), %rax
-; ZNVER4-NEXT: vmovq %rsi, %xmm2
-; ZNVER4-NEXT: vmovq %rax, %xmm1
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT: vmovq %rdi, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT: vmovq %r10, %xmm3
+; ZNVER4-NEXT: vpshldq $1, %xmm1, %xmm10, %xmm7
+; ZNVER4-NEXT: vpshldq $1, %xmm8, %xmm1, %xmm1
+; ZNVER4-NEXT: vpshldq $1, %xmm2, %xmm5, %xmm6
+; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm4, %xmm4
+; ZNVER4-NEXT: vpshldq $1, %xmm10, %xmm2, %xmm2
+; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ZNVER4-NEXT: vpshldq $1, %xmm5, %xmm3, %xmm1
+; ZNVER4-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
+; ZNVER4-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7
+; ZNVER4-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4
 ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT: vmovq %r9, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
 ; ZNVER4-NEXT: retq
   %d = bitcast <8 x i64> %a to i512
   %s = shl i512 %d, 1
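
; NOTE (illustrative sketch, not part of the test file): with AVX512-VBMI2,
; each 64-bit lane i > 0 of the widened `shl i512 %d, 1` is a funnel shift of
; adjacent lanes, (lane[i] << 1) | (lane[i-1] >> 63), which the checks above
; express as a single VPSHLDQ $1; only lane 0 needs the plain VPSLLQ $1. A
; minimal standalone example of the per-lane pattern, using a hypothetical
; function name (compile with e.g. `llc -mcpu=znver4`):
define <2 x i64> @shl_lane_pair_sketch(<2 x i64> %hi, <2 x i64> %lo) {
  ; fshl(hi, lo, 1) == (hi << 1) | (lo >> 63) per lane
  %r = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %hi, <2 x i64> %lo, <2 x i64> <i64 1, i64 1>)
  ret <2 x i64> %r
}
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
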
@@ -157,65 +139,28 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: lshr_i512_1:
 ; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: pushq %rbx
-; ZNVER4-NEXT: .cfi_def_cfa_offset 16
-; ZNVER4-NEXT: .cfi_offset %rbx, -16
 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: vmovq %xmm0, %r10
-; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT: vmovq %xmm1, %r9
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT: shrq %r10
-; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT: vmovq %xmm0, %rdx
-; ZNVER4-NEXT: vmovq %xmm1, %rdi
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT: movq %rdx, %r8
-; ZNVER4-NEXT: shrq %r8
-; ZNVER4-NEXT: shlq $63, %rax
-; ZNVER4-NEXT: movq %rdi, %rbx
-; ZNVER4-NEXT: shrq %rbx
-; ZNVER4-NEXT: shlq $63, %rdx
-; ZNVER4-NEXT: shlq $63, %rdi
-; ZNVER4-NEXT: vpsrlq $1, %xmm0, %xmm0
-; ZNVER4-NEXT: orq %r8, %rax
-; ZNVER4-NEXT: movq %r11, %r8
-; ZNVER4-NEXT: shlq $63, %r8
-; ZNVER4-NEXT: shrq %r11
-; ZNVER4-NEXT: orq %rbx, %r8
-; ZNVER4-NEXT: movq %r9, %rbx
-; ZNVER4-NEXT: orq %r11, %rdx
-; ZNVER4-NEXT: movq %rsi, %r11
-; ZNVER4-NEXT: shrq %r11
-; ZNVER4-NEXT: shlq $63, %rbx
-; ZNVER4-NEXT: shrq %r9
-; ZNVER4-NEXT: shlq $63, %rsi
-; ZNVER4-NEXT: vmovq %rax, %xmm4
-; ZNVER4-NEXT: orq %r11, %rbx
-; ZNVER4-NEXT: movq %rcx, %r11
-; ZNVER4-NEXT: shlq $63, %r11
-; ZNVER4-NEXT: shrq %rcx
-; ZNVER4-NEXT: orq %r10, %rsi
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT: orq %r9, %r11
-; ZNVER4-NEXT: orq %rdi, %rcx
-; ZNVER4-NEXT: vmovq %rbx, %xmm3
-; ZNVER4-NEXT: vmovq %rcx, %xmm1
-; ZNVER4-NEXT: vmovq %r11, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT: vmovq %rsi, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT: vmovq %r8, %xmm3
-; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT: vmovq %rdx, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT: popq %rbx
-; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT: vpshldq $63, %xmm0, %xmm9, %xmm0
+; ZNVER4-NEXT: vpshldq $63, %xmm1, %xmm10, %xmm8
+; ZNVER4-NEXT: vpshldq $63, %xmm2, %xmm6, %xmm7
+; ZNVER4-NEXT: vpshldq $63, %xmm10, %xmm2, %xmm2
+; ZNVER4-NEXT: vpshldq $63, %xmm9, %xmm1, %xmm1
+; ZNVER4-NEXT: vpshldq $63, %xmm3, %xmm4, %xmm5
+; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; ZNVER4-NEXT: vpshldq $63, %xmm6, %xmm3, %xmm2
+; ZNVER4-NEXT: vpsrlq $1, %xmm4, %xmm3
+; ZNVER4-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; ZNVER4-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
+; ZNVER4-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT: retq
   %d = bitcast <8 x i64> %a to i512
   %s = lshr i512 %d, 1
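
; NOTE (illustrative sketch, not part of the test file): each lane i < 7 of
; `lshr i512 %d, 1` is (lane[i] >> 1) | (lane[i+1] << 63), a funnel shift
; that the checks above express as VPSHLDQ $63; only the topmost lane falls
; through to a plain VPSRLQ $1. Hypothetical standalone example of the
; per-lane pattern:
define <2 x i64> @lshr_lane_pair_sketch(<2 x i64> %hi, <2 x i64> %lo) {
  ; fshr(hi, lo, 1) == (hi << 63) | (lo >> 1) per lane
  %r = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %hi, <2 x i64> %lo, <2 x i64> <i64 1, i64 1>)
  ret <2 x i64> %r
}
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
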
@@ -277,65 +222,28 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: ashr_i512_1:
 ; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: pushq %rbx
-; ZNVER4-NEXT: .cfi_def_cfa_offset 16
-; ZNVER4-NEXT: .cfi_offset %rbx, -16
 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: vmovq %xmm0, %r10
-; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT: vmovq %xmm1, %r9
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT: shrq %r10
-; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT: vmovq %xmm0, %rdx
-; ZNVER4-NEXT: vmovq %xmm1, %rdi
-; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT: movq %rdx, %r8
-; ZNVER4-NEXT: shrq %r8
-; ZNVER4-NEXT: shlq $63, %rax
-; ZNVER4-NEXT: movq %rdi, %rbx
-; ZNVER4-NEXT: shrq %rbx
-; ZNVER4-NEXT: shlq $63, %rdx
-; ZNVER4-NEXT: shlq $63, %rdi
-; ZNVER4-NEXT: vpsraq $1, %xmm0, %xmm0
-; ZNVER4-NEXT: orq %r8, %rax
-; ZNVER4-NEXT: movq %r11, %r8
-; ZNVER4-NEXT: shlq $63, %r8
-; ZNVER4-NEXT: shrq %r11
-; ZNVER4-NEXT: orq %rbx, %r8
-; ZNVER4-NEXT: movq %r9, %rbx
-; ZNVER4-NEXT: orq %r11, %rdx
-; ZNVER4-NEXT: movq %rsi, %r11
-; ZNVER4-NEXT: shrq %r11
-; ZNVER4-NEXT: shlq $63, %rbx
-; ZNVER4-NEXT: shrq %r9
-; ZNVER4-NEXT: shlq $63, %rsi
-; ZNVER4-NEXT: vmovq %rax, %xmm4
-; ZNVER4-NEXT: orq %r11, %rbx
-; ZNVER4-NEXT: movq %rcx, %r11
-; ZNVER4-NEXT: shlq $63, %r11
-; ZNVER4-NEXT: shrq %rcx
-; ZNVER4-NEXT: orq %r10, %rsi
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT: orq %r9, %r11
-; ZNVER4-NEXT: orq %rdi, %rcx
-; ZNVER4-NEXT: vmovq %rbx, %xmm3
-; ZNVER4-NEXT: vmovq %rcx, %xmm1
-; ZNVER4-NEXT: vmovq %r11, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT: vmovq %rsi, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT: vmovq %r8, %xmm3
-; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT: vmovq %rdx, %xmm2
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT: popq %rbx
-; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3]
+; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT: vpshldq $63, %xmm0, %xmm9, %xmm0
+; ZNVER4-NEXT: vpshldq $63, %xmm1, %xmm10, %xmm8
+; ZNVER4-NEXT: vpshldq $63, %xmm2, %xmm6, %xmm7
+; ZNVER4-NEXT: vpshldq $63, %xmm10, %xmm2, %xmm2
+; ZNVER4-NEXT: vpshldq $63, %xmm9, %xmm1, %xmm1
+; ZNVER4-NEXT: vpshldq $63, %xmm3, %xmm4, %xmm5
+; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; ZNVER4-NEXT: vpshldq $63, %xmm6, %xmm3, %xmm2
+; ZNVER4-NEXT: vpsraq $1, %xmm4, %xmm3
+; ZNVER4-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; ZNVER4-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
+; ZNVER4-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT: retq
   %d = bitcast <8 x i64> %a to i512
   %s = ashr i512 %d, 1
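
; NOTE (illustrative sketch, not part of the test file): ashr_i512_1 differs
; from lshr_i512_1 only in the topmost 64-bit lane, which must shift in the
; sign bit, hence the VPSRAQ $1 above instead of VPSRLQ $1; all lower lanes
; use the same VPSHLDQ $63 funnel shift. Hypothetical top-lane example:
define <2 x i64> @ashr_top_lane_sketch(<2 x i64> %top) {
  ; arithmetic shift keeps the sign bit; with AVX512VL this is a VPSRAQ
  %r = ashr <2 x i64> %top, <i64 1, i64 1>
  ret <2 x i64> %r
}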