|
4 | 4 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 |
5 | 5 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP |
6 | 6 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP |
7 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 |
8 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP |
9 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ |
10 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP |
11 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW |
12 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP |
13 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW |
14 | | -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP |
| 7 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-VL |
| 8 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512-FCP |
| 9 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ |
| 10 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-FCP |
| 11 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW |
| 12 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW-FCP |
| 13 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW |
| 14 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW-FCP |
15 | 15 |
|
16 | 16 | ; These patterns are produced by LoopVectorizer for interleaved loads. |
17 | 17 |
|
@@ -69,69 +69,6 @@ define void @load_i16_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou |
69 | 69 | ; AVX512-NEXT: vmovd %xmm1, (%rsi) |
70 | 70 | ; AVX512-NEXT: vmovd %xmm0, (%rdx) |
71 | 71 | ; AVX512-NEXT: retq |
72 | | -; |
73 | | -; AVX512-FCP-LABEL: load_i16_stride2_vf2: |
74 | | -; AVX512-FCP: # %bb.0: |
75 | | -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
76 | | -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
77 | | -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
78 | | -; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi) |
79 | | -; AVX512-FCP-NEXT: vmovd %xmm0, (%rdx) |
80 | | -; AVX512-FCP-NEXT: retq |
81 | | -; |
82 | | -; AVX512DQ-LABEL: load_i16_stride2_vf2: |
83 | | -; AVX512DQ: # %bb.0: |
84 | | -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
85 | | -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
86 | | -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
87 | | -; AVX512DQ-NEXT: vmovd %xmm1, (%rsi) |
88 | | -; AVX512DQ-NEXT: vmovd %xmm0, (%rdx) |
89 | | -; AVX512DQ-NEXT: retq |
90 | | -; |
91 | | -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf2: |
92 | | -; AVX512DQ-FCP: # %bb.0: |
93 | | -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
94 | | -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
95 | | -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
96 | | -; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi) |
97 | | -; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rdx) |
98 | | -; AVX512DQ-FCP-NEXT: retq |
99 | | -; |
100 | | -; AVX512BW-LABEL: load_i16_stride2_vf2: |
101 | | -; AVX512BW: # %bb.0: |
102 | | -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
103 | | -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
104 | | -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
105 | | -; AVX512BW-NEXT: vmovd %xmm1, (%rsi) |
106 | | -; AVX512BW-NEXT: vmovd %xmm0, (%rdx) |
107 | | -; AVX512BW-NEXT: retq |
108 | | -; |
109 | | -; AVX512BW-FCP-LABEL: load_i16_stride2_vf2: |
110 | | -; AVX512BW-FCP: # %bb.0: |
111 | | -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
112 | | -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
113 | | -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
114 | | -; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rsi) |
115 | | -; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rdx) |
116 | | -; AVX512BW-FCP-NEXT: retq |
117 | | -; |
118 | | -; AVX512DQ-BW-LABEL: load_i16_stride2_vf2: |
119 | | -; AVX512DQ-BW: # %bb.0: |
120 | | -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
121 | | -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
122 | | -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
123 | | -; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rsi) |
124 | | -; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rdx) |
125 | | -; AVX512DQ-BW-NEXT: retq |
126 | | -; |
127 | | -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf2: |
128 | | -; AVX512DQ-BW-FCP: # %bb.0: |
129 | | -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
130 | | -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
131 | | -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
132 | | -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rsi) |
133 | | -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rdx) |
134 | | -; AVX512DQ-BW-FCP-NEXT: retq |
135 | 72 | %wide.vec = load <4 x i16>, ptr %in.vec, align 64 |
136 | 73 | %strided.vec0 = shufflevector <4 x i16> %wide.vec, <4 x i16> poison, <2 x i32> <i32 0, i32 2> |
137 | 74 | %strided.vec1 = shufflevector <4 x i16> %wide.vec, <4 x i16> poison, <2 x i32> <i32 1, i32 3> |
@@ -198,62 +135,6 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou |
198 | 135 | ; AVX512-NEXT: vpmovdw %xmm0, (%rsi) |
199 | 136 | ; AVX512-NEXT: vmovq %xmm1, (%rdx) |
200 | 137 | ; AVX512-NEXT: retq |
201 | | -; |
202 | | -; AVX512-FCP-LABEL: load_i16_stride2_vf4: |
203 | | -; AVX512-FCP: # %bb.0: |
204 | | -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
205 | | -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
206 | | -; AVX512-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
207 | | -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) |
208 | | -; AVX512-FCP-NEXT: retq |
209 | | -; |
210 | | -; AVX512DQ-LABEL: load_i16_stride2_vf4: |
211 | | -; AVX512DQ: # %bb.0: |
212 | | -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 |
213 | | -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
214 | | -; AVX512DQ-NEXT: vpmovdw %xmm0, (%rsi) |
215 | | -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) |
216 | | -; AVX512DQ-NEXT: retq |
217 | | -; |
218 | | -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf4: |
219 | | -; AVX512DQ-FCP: # %bb.0: |
220 | | -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
221 | | -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
222 | | -; AVX512DQ-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
223 | | -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) |
224 | | -; AVX512DQ-FCP-NEXT: retq |
225 | | -; |
226 | | -; AVX512BW-LABEL: load_i16_stride2_vf4: |
227 | | -; AVX512BW: # %bb.0: |
228 | | -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 |
229 | | -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
230 | | -; AVX512BW-NEXT: vpmovdw %xmm0, (%rsi) |
231 | | -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) |
232 | | -; AVX512BW-NEXT: retq |
233 | | -; |
234 | | -; AVX512BW-FCP-LABEL: load_i16_stride2_vf4: |
235 | | -; AVX512BW-FCP: # %bb.0: |
236 | | -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
237 | | -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
238 | | -; AVX512BW-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
239 | | -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx) |
240 | | -; AVX512BW-FCP-NEXT: retq |
241 | | -; |
242 | | -; AVX512DQ-BW-LABEL: load_i16_stride2_vf4: |
243 | | -; AVX512DQ-BW: # %bb.0: |
244 | | -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 |
245 | | -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
246 | | -; AVX512DQ-BW-NEXT: vpmovdw %xmm0, (%rsi) |
247 | | -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) |
248 | | -; AVX512DQ-BW-NEXT: retq |
249 | | -; |
250 | | -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf4: |
251 | | -; AVX512DQ-BW-FCP: # %bb.0: |
252 | | -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
253 | | -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
254 | | -; AVX512DQ-BW-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
255 | | -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx) |
256 | | -; AVX512DQ-BW-FCP-NEXT: retq |
257 | 138 | %wide.vec = load <8 x i16>, ptr %in.vec, align 64 |
258 | 139 | %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> |
259 | 140 | %strided.vec1 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> |
@@ -349,69 +230,6 @@ define void @load_i16_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou |
349 | 230 | ; AVX512-NEXT: vpmovdw %ymm1, (%rdx) |
350 | 231 | ; AVX512-NEXT: vzeroupper |
351 | 232 | ; AVX512-NEXT: retq |
352 | | -; |
353 | | -; AVX512-FCP-LABEL: load_i16_stride2_vf8: |
354 | | -; AVX512-FCP: # %bb.0: |
355 | | -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
356 | | -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
357 | | -; AVX512-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
358 | | -; AVX512-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
359 | | -; AVX512-FCP-NEXT: vzeroupper |
360 | | -; AVX512-FCP-NEXT: retq |
361 | | -; |
362 | | -; AVX512DQ-LABEL: load_i16_stride2_vf8: |
363 | | -; AVX512DQ: # %bb.0: |
364 | | -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 |
365 | | -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 |
366 | | -; AVX512DQ-NEXT: vpmovdw %ymm0, (%rsi) |
367 | | -; AVX512DQ-NEXT: vpmovdw %ymm1, (%rdx) |
368 | | -; AVX512DQ-NEXT: vzeroupper |
369 | | -; AVX512DQ-NEXT: retq |
370 | | -; |
371 | | -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf8: |
372 | | -; AVX512DQ-FCP: # %bb.0: |
373 | | -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
374 | | -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
375 | | -; AVX512DQ-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
376 | | -; AVX512DQ-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
377 | | -; AVX512DQ-FCP-NEXT: vzeroupper |
378 | | -; AVX512DQ-FCP-NEXT: retq |
379 | | -; |
380 | | -; AVX512BW-LABEL: load_i16_stride2_vf8: |
381 | | -; AVX512BW: # %bb.0: |
382 | | -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 |
383 | | -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 |
384 | | -; AVX512BW-NEXT: vpmovdw %ymm0, (%rsi) |
385 | | -; AVX512BW-NEXT: vpmovdw %ymm1, (%rdx) |
386 | | -; AVX512BW-NEXT: vzeroupper |
387 | | -; AVX512BW-NEXT: retq |
388 | | -; |
389 | | -; AVX512BW-FCP-LABEL: load_i16_stride2_vf8: |
390 | | -; AVX512BW-FCP: # %bb.0: |
391 | | -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
392 | | -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
393 | | -; AVX512BW-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
394 | | -; AVX512BW-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
395 | | -; AVX512BW-FCP-NEXT: vzeroupper |
396 | | -; AVX512BW-FCP-NEXT: retq |
397 | | -; |
398 | | -; AVX512DQ-BW-LABEL: load_i16_stride2_vf8: |
399 | | -; AVX512DQ-BW: # %bb.0: |
400 | | -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 |
401 | | -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 |
402 | | -; AVX512DQ-BW-NEXT: vpmovdw %ymm0, (%rsi) |
403 | | -; AVX512DQ-BW-NEXT: vpmovdw %ymm1, (%rdx) |
404 | | -; AVX512DQ-BW-NEXT: vzeroupper |
405 | | -; AVX512DQ-BW-NEXT: retq |
406 | | -; |
407 | | -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf8: |
408 | | -; AVX512DQ-BW-FCP: # %bb.0: |
409 | | -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
410 | | -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
411 | | -; AVX512DQ-BW-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
412 | | -; AVX512DQ-BW-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
413 | | -; AVX512DQ-BW-FCP-NEXT: vzeroupper |
414 | | -; AVX512DQ-BW-FCP-NEXT: retq |
415 | 233 | %wide.vec = load <16 x i16>, ptr %in.vec, align 64 |
416 | 234 | %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> |
417 | 235 | %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> |
@@ -544,69 +362,6 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no |
544 | 362 | ; AVX512-NEXT: vpmovdw %zmm1, (%rdx) |
545 | 363 | ; AVX512-NEXT: vzeroupper |
546 | 364 | ; AVX512-NEXT: retq |
547 | | -; |
548 | | -; AVX512-FCP-LABEL: load_i16_stride2_vf16: |
549 | | -; AVX512-FCP: # %bb.0: |
550 | | -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
551 | | -; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
552 | | -; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
553 | | -; AVX512-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
554 | | -; AVX512-FCP-NEXT: vzeroupper |
555 | | -; AVX512-FCP-NEXT: retq |
556 | | -; |
557 | | -; AVX512DQ-LABEL: load_i16_stride2_vf16: |
558 | | -; AVX512DQ: # %bb.0: |
559 | | -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 |
560 | | -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 |
561 | | -; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi) |
562 | | -; AVX512DQ-NEXT: vpmovdw %zmm1, (%rdx) |
563 | | -; AVX512DQ-NEXT: vzeroupper |
564 | | -; AVX512DQ-NEXT: retq |
565 | | -; |
566 | | -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf16: |
567 | | -; AVX512DQ-FCP: # %bb.0: |
568 | | -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
569 | | -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
570 | | -; AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
571 | | -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
572 | | -; AVX512DQ-FCP-NEXT: vzeroupper |
573 | | -; AVX512DQ-FCP-NEXT: retq |
574 | | -; |
575 | | -; AVX512BW-LABEL: load_i16_stride2_vf16: |
576 | | -; AVX512BW: # %bb.0: |
577 | | -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 |
578 | | -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 |
579 | | -; AVX512BW-NEXT: vpmovdw %zmm0, (%rsi) |
580 | | -; AVX512BW-NEXT: vpmovdw %zmm1, (%rdx) |
581 | | -; AVX512BW-NEXT: vzeroupper |
582 | | -; AVX512BW-NEXT: retq |
583 | | -; |
584 | | -; AVX512BW-FCP-LABEL: load_i16_stride2_vf16: |
585 | | -; AVX512BW-FCP: # %bb.0: |
586 | | -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
587 | | -; AVX512BW-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
588 | | -; AVX512BW-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
589 | | -; AVX512BW-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
590 | | -; AVX512BW-FCP-NEXT: vzeroupper |
591 | | -; AVX512BW-FCP-NEXT: retq |
592 | | -; |
593 | | -; AVX512DQ-BW-LABEL: load_i16_stride2_vf16: |
594 | | -; AVX512DQ-BW: # %bb.0: |
595 | | -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 |
596 | | -; AVX512DQ-BW-NEXT: vpsrld $16, %zmm0, %zmm1 |
597 | | -; AVX512DQ-BW-NEXT: vpmovdw %zmm0, (%rsi) |
598 | | -; AVX512DQ-BW-NEXT: vpmovdw %zmm1, (%rdx) |
599 | | -; AVX512DQ-BW-NEXT: vzeroupper |
600 | | -; AVX512DQ-BW-NEXT: retq |
601 | | -; |
602 | | -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf16: |
603 | | -; AVX512DQ-BW-FCP: # %bb.0: |
604 | | -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
605 | | -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
606 | | -; AVX512DQ-BW-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
607 | | -; AVX512DQ-BW-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
608 | | -; AVX512DQ-BW-FCP-NEXT: vzeroupper |
609 | | -; AVX512DQ-BW-FCP-NEXT: retq |
610 | 365 | %wide.vec = load <32 x i16>, ptr %in.vec, align 64 |
611 | 366 | %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> |
612 | 367 | %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> |
@@ -817,18 +572,18 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no |
817 | 572 | ; AVX2-FCP-NEXT: vzeroupper |
818 | 573 | ; AVX2-FCP-NEXT: retq |
819 | 574 | ; |
820 | | -; AVX512-LABEL: load_i16_stride2_vf32: |
821 | | -; AVX512: # %bb.0: |
822 | | -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 |
823 | | -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 |
824 | | -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm2 |
825 | | -; AVX512-NEXT: vpsrld $16, %zmm1, %zmm3 |
826 | | -; AVX512-NEXT: vpmovdw %zmm1, 32(%rsi) |
827 | | -; AVX512-NEXT: vpmovdw %zmm0, (%rsi) |
828 | | -; AVX512-NEXT: vpmovdw %zmm3, 32(%rdx) |
829 | | -; AVX512-NEXT: vpmovdw %zmm2, (%rdx) |
830 | | -; AVX512-NEXT: vzeroupper |
831 | | -; AVX512-NEXT: retq |
| 575 | +; AVX512-VL-LABEL: load_i16_stride2_vf32: |
| 576 | +; AVX512-VL: # %bb.0: |
| 577 | +; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 |
| 578 | +; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 |
| 579 | +; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm2 |
| 580 | +; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm3 |
| 581 | +; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rsi) |
| 582 | +; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) |
| 583 | +; AVX512-VL-NEXT: vpmovdw %zmm3, 32(%rdx) |
| 584 | +; AVX512-VL-NEXT: vpmovdw %zmm2, (%rdx) |
| 585 | +; AVX512-VL-NEXT: vzeroupper |
| 586 | +; AVX512-VL-NEXT: retq |
832 | 587 | ; |
833 | 588 | ; AVX512-FCP-LABEL: load_i16_stride2_vf32: |
834 | 589 | ; AVX512-FCP: # %bb.0: |
@@ -1344,27 +1099,27 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no |
1344 | 1099 | ; AVX2-FCP-NEXT: vzeroupper |
1345 | 1100 | ; AVX2-FCP-NEXT: retq |
1346 | 1101 | ; |
1347 | | -; AVX512-LABEL: load_i16_stride2_vf64: |
1348 | | -; AVX512: # %bb.0: |
1349 | | -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 |
1350 | | -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 |
1351 | | -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 |
1352 | | -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 |
1353 | | -; AVX512-NEXT: vpmovdw %zmm1, %ymm4 |
1354 | | -; AVX512-NEXT: vpsrld $16, %zmm1, %zmm1 |
1355 | | -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm5 |
1356 | | -; AVX512-NEXT: vpsrld $16, %zmm3, %zmm6 |
1357 | | -; AVX512-NEXT: vpsrld $16, %zmm2, %zmm7 |
1358 | | -; AVX512-NEXT: vpmovdw %zmm0, (%rsi) |
1359 | | -; AVX512-NEXT: vmovdqa %ymm4, 32(%rsi) |
1360 | | -; AVX512-NEXT: vpmovdw %zmm2, 64(%rsi) |
1361 | | -; AVX512-NEXT: vpmovdw %zmm3, 96(%rsi) |
1362 | | -; AVX512-NEXT: vpmovdw %zmm7, 64(%rdx) |
1363 | | -; AVX512-NEXT: vpmovdw %zmm6, 96(%rdx) |
1364 | | -; AVX512-NEXT: vpmovdw %zmm5, (%rdx) |
1365 | | -; AVX512-NEXT: vpmovdw %zmm1, 32(%rdx) |
1366 | | -; AVX512-NEXT: vzeroupper |
1367 | | -; AVX512-NEXT: retq |
| 1102 | +; AVX512-VL-LABEL: load_i16_stride2_vf64: |
| 1103 | +; AVX512-VL: # %bb.0: |
| 1104 | +; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 |
| 1105 | +; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 |
| 1106 | +; AVX512-VL-NEXT: vmovdqa64 128(%rdi), %zmm2 |
| 1107 | +; AVX512-VL-NEXT: vmovdqa64 192(%rdi), %zmm3 |
| 1108 | +; AVX512-VL-NEXT: vpmovdw %zmm1, %ymm4 |
| 1109 | +; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm1 |
| 1110 | +; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm5 |
| 1111 | +; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm6 |
| 1112 | +; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm7 |
| 1113 | +; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) |
| 1114 | +; AVX512-VL-NEXT: vmovdqa %ymm4, 32(%rsi) |
| 1115 | +; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi) |
| 1116 | +; AVX512-VL-NEXT: vpmovdw %zmm3, 96(%rsi) |
| 1117 | +; AVX512-VL-NEXT: vpmovdw %zmm7, 64(%rdx) |
| 1118 | +; AVX512-VL-NEXT: vpmovdw %zmm6, 96(%rdx) |
| 1119 | +; AVX512-VL-NEXT: vpmovdw %zmm5, (%rdx) |
| 1120 | +; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rdx) |
| 1121 | +; AVX512-VL-NEXT: vzeroupper |
| 1122 | +; AVX512-VL-NEXT: retq |
1368 | 1123 | ; |
1369 | 1124 | ; AVX512-FCP-LABEL: load_i16_stride2_vf64: |
1370 | 1125 | ; AVX512-FCP: # %bb.0: |
|
0 commit comments