@@ -48,46 +48,20 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
  ;
  ; ZNVER4-LABEL: shl_i512_1:
  ; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %r9
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rax
- ; ZNVER4-NEXT: vmovq %xmm1, %rcx
  ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: shrq $63, %rdx
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rsi
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: leaq (%rdx,%r9,2), %rdx
- ; ZNVER4-NEXT: shrq $63, %r9
- ; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: vmovq %xmm1, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r8
- ; ZNVER4-NEXT: leaq (%r9,%r10,2), %r9
- ; ZNVER4-NEXT: shrq $63, %r10
- ; ZNVER4-NEXT: vmovq %rdx, %xmm4
- ; ZNVER4-NEXT: leaq (%r10,%r8,2), %r10
- ; ZNVER4-NEXT: shrq $63, %r8
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
- ; ZNVER4-NEXT: leaq (%r8,%rdi,2), %r8
- ; ZNVER4-NEXT: shrq $63, %rdi
- ; ZNVER4-NEXT: leaq (%rdi,%rsi,2), %rdi
- ; ZNVER4-NEXT: shrq $63, %rsi
- ; ZNVER4-NEXT: leaq (%rsi,%rcx,2), %rsi
- ; ZNVER4-NEXT: shrq $63, %rcx
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: leaq (%rcx,%rax,2), %rax
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vmovq %rax, %xmm1
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rdi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vmovq %r10, %xmm3
+ ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+ ; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4
  ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %r9, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+ ; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
+ ; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
  ; ZNVER4-NEXT: retq
  %d = bitcast <8 x i64> %a to i512
  %s = shl i512 %d, 1
@@ -142,65 +116,21 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
  ;
  ; ZNVER4-LABEL: lshr_i512_1:
  ; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: pushq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 16
- ; ZNVER4-NEXT: .cfi_offset %rbx, -16
+ ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
  ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
- ; ZNVER4-NEXT: vmovq %xmm1, %r9
- ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
- ; ZNVER4-NEXT: shrq %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
- ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; ZNVER4-NEXT: movq %rdx, %r8
- ; ZNVER4-NEXT: shrq %r8
- ; ZNVER4-NEXT: shlq $63, %rax
- ; ZNVER4-NEXT: movq %rdi, %rbx
- ; ZNVER4-NEXT: shrq %rbx
- ; ZNVER4-NEXT: shlq $63, %rdx
- ; ZNVER4-NEXT: shlq $63, %rdi
- ; ZNVER4-NEXT: vpsrlq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: orq %r8, %rax
- ; ZNVER4-NEXT: movq %r11, %r8
- ; ZNVER4-NEXT: shlq $63, %r8
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: orq %rbx, %r8
- ; ZNVER4-NEXT: movq %r9, %rbx
- ; ZNVER4-NEXT: orq %r11, %rdx
- ; ZNVER4-NEXT: movq %rsi, %r11
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: shlq $63, %rbx
- ; ZNVER4-NEXT: shrq %r9
- ; ZNVER4-NEXT: shlq $63, %rsi
- ; ZNVER4-NEXT: vmovq %rax, %xmm4
- ; ZNVER4-NEXT: orq %r11, %rbx
- ; ZNVER4-NEXT: movq %rcx, %r11
- ; ZNVER4-NEXT: shlq $63, %r11
- ; ZNVER4-NEXT: shrq %rcx
- ; ZNVER4-NEXT: orq %r10, %rsi
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
- ; ZNVER4-NEXT: orq %r9, %r11
- ; ZNVER4-NEXT: orq %rdi, %rcx
- ; ZNVER4-NEXT: vmovq %rbx, %xmm3
- ; ZNVER4-NEXT: vmovq %rcx, %xmm1
- ; ZNVER4-NEXT: vmovq %r11, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %rdx, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
- ; ZNVER4-NEXT: popq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+ ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+ ; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2
+ ; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
  ; ZNVER4-NEXT: retq
  %d = bitcast <8 x i64> %a to i512
  %s = lshr i512 %d, 1
@@ -255,65 +185,21 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
  ;
  ; ZNVER4-LABEL: ashr_i512_1:
  ; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: pushq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 16
- ; ZNVER4-NEXT: .cfi_offset %rbx, -16
+ ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
  ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
- ; ZNVER4-NEXT: vmovq %xmm1, %r9
- ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
- ; ZNVER4-NEXT: shrq %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
- ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; ZNVER4-NEXT: movq %rdx, %r8
- ; ZNVER4-NEXT: shrq %r8
- ; ZNVER4-NEXT: shlq $63, %rax
- ; ZNVER4-NEXT: movq %rdi, %rbx
- ; ZNVER4-NEXT: shrq %rbx
- ; ZNVER4-NEXT: shlq $63, %rdx
- ; ZNVER4-NEXT: shlq $63, %rdi
- ; ZNVER4-NEXT: vpsraq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: orq %r8, %rax
- ; ZNVER4-NEXT: movq %r11, %r8
- ; ZNVER4-NEXT: shlq $63, %r8
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: orq %rbx, %r8
- ; ZNVER4-NEXT: movq %r9, %rbx
- ; ZNVER4-NEXT: orq %r11, %rdx
- ; ZNVER4-NEXT: movq %rsi, %r11
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: shlq $63, %rbx
- ; ZNVER4-NEXT: shrq %r9
- ; ZNVER4-NEXT: shlq $63, %rsi
- ; ZNVER4-NEXT: vmovq %rax, %xmm4
- ; ZNVER4-NEXT: orq %r11, %rbx
- ; ZNVER4-NEXT: movq %rcx, %r11
- ; ZNVER4-NEXT: shlq $63, %r11
- ; ZNVER4-NEXT: shrq %rcx
- ; ZNVER4-NEXT: orq %r10, %rsi
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
- ; ZNVER4-NEXT: orq %r9, %r11
- ; ZNVER4-NEXT: orq %rdi, %rcx
- ; ZNVER4-NEXT: vmovq %rbx, %xmm3
- ; ZNVER4-NEXT: vmovq %rcx, %xmm1
- ; ZNVER4-NEXT: vmovq %r11, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %rdx, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
- ; ZNVER4-NEXT: popq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+ ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+ ; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2
+ ; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
  ; ZNVER4-NEXT: retq
  %d = bitcast <8 x i64> %a to i512
  %s = ashr i512 %d, 1
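
For reference, a minimal standalone sketch of the IR pattern these check lines exercise. The diff only shows the bitcast and the shift; the bitcast back to <8 x i64> and the return, plus the llc flags in the comment, are assumptions, not lines from the test file.

; Sketch only: compile with something like llc -mtriple=x86_64-- -mcpu=znver4 (assumed flags)
; to reproduce the ZNVER4 codegen checked above.
define <8 x i64> @shl_i512_1_sketch(<8 x i64> %a) {
  %d = bitcast <8 x i64> %a to i512        ; reinterpret the 8 x i64 vector as one 512-bit integer
  %s = shl i512 %d, 1                      ; shift the whole 512-bit value left by one
  %r = bitcast i512 %s to <8 x i64>        ; assumed: cast back to the vector return type
  ret <8 x i64> %r
}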