
Commit 1111ad4

regression
1 parent bcc9a7d commit 1111ad4

6 files changed: +556 -525 lines


llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll

Lines changed: 11 additions & 8 deletions
@@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: pslld $10, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
+; SSE2-NEXT: orps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
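
Note: the hunk above only changes SSE2 CHECK lines. For orientation, urem-seteq-illegal-types.ll exercises lowering of an unsigned remainder by per-element constants on an illegal <3 x i11> vector followed by an equality test; a minimal sketch of that IR shape (the divisor constants below are illustrative, not the actual test body) is:

define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
  ; Hypothetical divisors, chosen only to show the urem + seteq pattern the checks cover.
  %rem = urem <3 x i11> %X, <i11 6, i11 7, i11 9>
  %cmp = icmp eq <3 x i11> %rem, zeroinitializer
  ret <3 x i1> %cmp
}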

llvm/test/CodeGen/X86/vector-fshl-sub128.ll

Lines changed: 13 additions & 12 deletions
@@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-NEXT: psrld $28, %xmm1
 ; SSE2-NEXT: psrld $27, %xmm2
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: pslld $4, %xmm0
+; SSE2-NEXT: pslld $5, %xmm2
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
@@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT: psrld $27, %xmm2
 ; SSE41-NEXT: psrld $28, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pslld $5, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: por %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
@@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; X86-SSE2-NEXT: psrld $28, %xmm1
 ; X86-SSE2-NEXT: psrld $27, %xmm2
 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pslld $4, %xmm0
+; X86-SSE2-NEXT: pslld $5, %xmm2
 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE2-NEXT: por %xmm1, %xmm0
 ; X86-SSE2-NEXT: retl
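
For context, constant_funnnel_v2i32 tests a funnel shift by constant per-element amounts; the psrld $28/$27 and new pslld $4/$5 pairs above are consistent with shift amounts of 4 and 5 bits. A minimal sketch of the underlying IR, assuming those amounts (the actual test body may differ):

declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)

define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
  ; fshl(x, y, c) = (x << c) | (y >> (32 - c)); c = <4, 5> is inferred from the checked shifts.
  %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
  ret <2 x i32> %res
}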

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll

Lines changed: 64 additions & 62 deletions
@@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: movq %xmm1, (%rdx)
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm1, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: movq %xmm0, (%rcx)
 ; SSE-NEXT: retq
 ;
@@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
 ; AVX-NEXT: vmovq %xmm0, (%rcx)
 ; AVX-NEXT: retq
 ;
@@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-NEXT: retq
 ;
@@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FP-NEXT: retq
 ;
@@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: retq
 ;
@@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-NEXT: retq
 ;
@@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: retq
 ;
@@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-NEXT: retq
 ;
@@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <12 x i16>, ptr %in.vec, align 64
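
For context, load_i16_stride3_vf4 loads twelve interleaved i16 elements and splits them into three stride-3 subvectors, one per output pointer; the %wide.vec line above is the start of that IR body. A rough sketch of the pattern (the fourth parameter name and the exact shuffle form are assumptions here, matching the 0,3,6,9 / 1,4,7,10 / 2,5,8,11 permute masks in the checks):

define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
  %wide.vec = load <12 x i16>, ptr %in.vec, align 64
  ; Deinterleave the stride-3 elements into three <4 x i16> results.
  %v0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v1 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %v2 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  store <4 x i16> %v0, ptr %out.vec0
  store <4 x i16> %v1, ptr %out.vec1
  store <4 x i16> %v2, ptr %out.vec2
  ret void
}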
