 ; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X86-AVX512
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64-AVX512
 
-; FIXME: PR78897 - Don't vectorize a mul if we still need the extract
+; PR78897 - Don't vectorize a mul of extracted values if we'd still need the extract.
+; TODO: We should vectorize on 32-bit targets.
 define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE2-LABEL: produceShuffleVectorForByte:
 ; X86-SSE2: # %bb.0: # %entry
@@ -70,21 +71,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: movq %xmm1, %rax
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE2-NEXT: psrlq $32, %xmm2
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153]
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; X64-SSE2-NEXT: paddq %xmm2, %xmm4
-; X64-SSE2-NEXT: psllq $32, %xmm4
-; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; X64-SSE2-NEXT: paddq %xmm4, %xmm1
-; X64-SSE2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-SSE2-NEXT: xorq %rax, %rcx
-; X64-SSE2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-SSE2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-SSE2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-SSE2-NEXT: xorq %rax, %rdx
 ; X64-SSE2-NEXT: imulq %rcx, %rax
-; X64-SSE2-NEXT: movq %rax, %xmm2
+; X64-SSE2-NEXT: movq %rax, %xmm1
+; X64-SSE2-NEXT: imulq %rcx, %rdx
+; X64-SSE2-NEXT: movq %rdx, %xmm2
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: pandn %xmm2, %xmm0
 ; X64-SSE2-NEXT: por %xmm1, %xmm0
@@ -147,24 +140,16 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE42-NEXT: pxor %xmm0, %xmm0
 ; X64-SSE42-NEXT: pcmpeqb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
-; X64-SSE42-NEXT: pand %xmm0, %xmm2
-; X64-SSE42-NEXT: movq %xmm2, %rax
-; X64-SSE42-NEXT: movdqa %xmm2, %xmm1
-; X64-SSE42-NEXT: psrlq $32, %xmm1
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-SSE42-NEXT: pmuludq %xmm3, %xmm1
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153]
-; X64-SSE42-NEXT: pmuludq %xmm2, %xmm4
-; X64-SSE42-NEXT: paddq %xmm1, %xmm4
-; X64-SSE42-NEXT: psllq $32, %xmm4
-; X64-SSE42-NEXT: pmuludq %xmm3, %xmm2
-; X64-SSE42-NEXT: paddq %xmm4, %xmm2
-; X64-SSE42-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-SSE42-NEXT: xorq %rax, %rcx
-; X64-SSE42-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
+; X64-SSE42-NEXT: pand %xmm0, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-SSE42-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-SSE42-NEXT: xorq %rax, %rdx
 ; X64-SSE42-NEXT: imulq %rcx, %rax
-; X64-SSE42-NEXT: movq %rax, %xmm1
+; X64-SSE42-NEXT: movq %rax, %xmm2
+; X64-SSE42-NEXT: imulq %rcx, %rdx
+; X64-SSE42-NEXT: movq %rdx, %xmm1
 ; X64-SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: psrlw $4, %xmm0
@@ -220,19 +205,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; X64-AVX2-NEXT: vmovq %xmm1, %rax
-; X64-AVX2-NEXT: vpsrlq $32, %xmm1, %xmm2
-; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm2
-; X64-AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-AVX2-NEXT: xorq %rax, %rcx
-; X64-AVX2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-AVX2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-AVX2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-AVX2-NEXT: xorq %rax, %rdx
 ; X64-AVX2-NEXT: imulq %rcx, %rax
-; X64-AVX2-NEXT: vmovq %rax, %xmm2
+; X64-AVX2-NEXT: vmovq %rax, %xmm1
+; X64-AVX2-NEXT: imulq %rcx, %rdx
+; X64-AVX2-NEXT: vmovq %rdx, %xmm2
 ; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
 ; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
 ; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -280,16 +259,17 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0
 ; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
 ; X64-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
-; X64-AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
-; X64-AVX512-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-AVX512-NEXT: xorq %rax, %rcx
-; X64-AVX512-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-AVX512-NEXT: xorq %rax, %rdx
 ; X64-AVX512-NEXT: imulq %rcx, %rax
 ; X64-AVX512-NEXT: vmovq %rax, %xmm0
-; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-AVX512-NEXT: imulq %rcx, %rdx
+; X64-AVX512-NEXT: vmovq %rdx, %xmm1
+; X64-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; X64-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0
+; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; X64-AVX512-NEXT: retq
 entry:
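For reference, a minimal LLVM IR sketch of the kind of pattern PR78897 covers (an assumed reduction, not the exact IR of @produceShuffleVectorForByte): the value is extracted to a scalar anyway, so both multiplies by 0x1111111111111110 are better kept scalar than re-vectorized, which is what the updated X64 check lines above expect.

; Hypothetical reduction: %x is already extracted, so keep both muls scalar.
define <2 x i64> @sketch(<2 x i64> %v) {
  %x  = extractelement <2 x i64> %v, i32 0
  %lo = mul i64 %x, 1229782938247303440            ; 0x1111111111111110
  %t  = xor i64 %x, 76861433640456465              ; 0x111111111111111
  %hi = mul i64 %t, 1229782938247303440
  %r0 = insertelement <2 x i64> poison, i64 %lo, i32 0
  %r1 = insertelement <2 x i64> %r0, i64 %hi, i32 1
  ret <2 x i64> %r1
}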