
Commit 07b4396

[X86] combineConcatVectorOps - require free concatenation of at least one operand of UNPCKL\H (llvm#135366)
Stop just replacing 2*UNPCK+INSERT_SUBVECTOR with 2*INSERT_SUBVECTOR+UNPCK.

Currently limited to sub-64-bit element cases until we've accounted for the remaining regressions from some build_vector style patterns.
1 parent 58211f5 · commit 07b4396

8 files changed (+864, -851 lines)
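The fold this patch is gating rewrites concat(unpckl(a0,b0), unpckl(a1,b1)) into unpckl(concat(a0,a1), concat(b0,b1)), which is only valid because UNPCKL/UNPCKH operate on each 128-bit lane independently. Below is a minimal standalone C++ check of that per-lane identity for 32-bit elements; it uses plain arrays and hypothetical helper names (unpckl128, unpckl256, concat), not LLVM APIs.

// Toy check of the lane-wise identity behind the UNPCKL concatenation fold:
//   unpckl256(concat(a0,a1), concat(b0,b1)) == concat(unpckl128(a0,b0), unpckl128(a1,b1))
// for 32-bit elements. Purely illustrative; none of these helpers are LLVM code.
#include <array>
#include <cassert>
#include <iostream>

using V4 = std::array<int, 4>; // one 128-bit lane of i32
using V8 = std::array<int, 8>; // a 256-bit vector of i32

// 128-bit vpunpckldq: interleave the low halves of two lanes.
static V4 unpckl128(const V4 &A, const V4 &B) { return {A[0], B[0], A[1], B[1]}; }

// Concatenate two 128-bit lanes into a 256-bit vector (low lane first).
static V8 concat(const V4 &Lo, const V4 &Hi) {
  return {Lo[0], Lo[1], Lo[2], Lo[3], Hi[0], Hi[1], Hi[2], Hi[3]};
}

// 256-bit vpunpckldq operates on each 128-bit lane independently.
static V8 unpckl256(const V8 &A, const V8 &B) {
  return {A[0], B[0], A[1], B[1], A[4], B[4], A[5], B[5]};
}

int main() {
  V4 A0{0, 1, 2, 3}, A1{4, 5, 6, 7};
  V4 B0{10, 11, 12, 13}, B1{14, 15, 16, 17};
  V8 Wide = unpckl256(concat(A0, A1), concat(B0, B1));
  V8 Narrow = concat(unpckl128(A0, B0), unpckl128(A1, B1));
  assert(Wide == Narrow);
  std::cout << "per-lane unpack identity holds\n";
  return 0;
}

The patch does not change this identity; it only refuses to apply the rewrite when doing so would not pay for itself, as the X86ISelLowering.cpp diff below shows.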

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 5 deletions
@@ -58245,17 +58245,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   case X86ISD::UNPCKL: {
     // TODO: UNPCK should use CombineSubOperand
     // Don't concatenate build_vector patterns.
-    if (!IsSplat && EltSizeInBits >= 32 &&
-        ((VT.is256BitVector() && Subtarget.hasInt256()) ||
-         (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+    if (!IsSplat &&
+        ((VT.is256BitVector() &&
+          (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
+         (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+          (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
        none_of(Ops, [](SDValue Op) {
          return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
                     ISD::SCALAR_TO_VECTOR ||
                 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
                     ISD::SCALAR_TO_VECTOR;
        })) {
-      return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
-                         ConcatSubOperand(VT, Ops, 1));
+      SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+      SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+      if (Concat0 || Concat1 ||
+          (Subtarget.hasInt256() && EltSizeInBits == 64))
+        return DAG.getNode(Opcode, DL, VT,
+                           Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+                           Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
    }
    break;
  }
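The new guard only rebuilds the wide node when the rewrite is not a net loss: Concat0/Concat1 appear to probe whether each operand's concatenation folds away for free, and the combine fires only if at least one probe succeeds (or the wide unpack is cheap regardless, e.g. 64-bit elements with AVX2). A hedged, self-contained sketch of that gating shape follows; widenIfFree/widenAlways are invented stand-ins for the real CombineSubOperand/ConcatSubOperand lambdas, not LLVM APIs.

// Toy model of "require a free concatenation of at least one operand".
// All names here are hypothetical; only the decision shape mirrors the patch.
#include <iostream>
#include <optional>
#include <string>

struct Val {
  std::string Name;
  bool FreeToWiden; // stand-in for "this operand already comes from a wide value"
};

// Succeeds only when widening costs nothing extra (mirrors the Concat0/Concat1 probes).
static std::optional<Val> widenIfFree(const Val &V) {
  if (V.FreeToWiden)
    return Val{"concat(" + V.Name + ")", true};
  return std::nullopt;
}

// Unconditional widening, which may insert an extra shuffle.
static Val widenAlways(const Val &V) { return Val{"concat(" + V.Name + ")", false}; }

static std::optional<std::string>
tryWideUnpack(const Val &A, const Val &B, bool CheapAnyway) {
  std::optional<Val> WideA = widenIfFree(A);
  std::optional<Val> WideB = widenIfFree(B);
  // The gate: bail out unless one side widens for free, or the target makes the
  // wide unpack cheap regardless (the AVX2 + 64-bit element escape hatch).
  if (!WideA && !WideB && !CheapAnyway)
    return std::nullopt;
  Val L = WideA ? *WideA : widenAlways(A);
  Val R = WideB ? *WideB : widenAlways(B);
  return "unpckl(" + L.Name + ", " + R.Name + ")";
}

int main() {
  Val A{"a", false}, B{"b", true};
  if (auto N = tryWideUnpack(A, B, /*CheapAnyway=*/false))
    std::cout << *N << "\n"; // fires: b widens for free
  if (auto N = tryWideUnpack(A, A, /*CheapAnyway=*/false))
    std::cout << *N << "\n"; // skipped: neither side is free
  return 0;
}

Note that the existing SCALAR_TO_VECTOR bail-out in the diff above is kept, so build_vector-style patterns are still left alone; the test churn below reflects only the new gating and the relaxed sub-32-bit element cases.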

llvm/test/CodeGen/X86/avx512fp16-frem.ll

Lines changed: 337 additions & 333 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-half-conversions.ll

Lines changed: 145 additions & 141 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll

Lines changed: 14 additions & 16 deletions
@@ -130,14 +130,13 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
 ; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vzeroupper
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
+; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: store_i16_stride4_vf2:
@@ -153,14 +152,13 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
 ; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll

Lines changed: 66 additions & 66 deletions
@@ -84,13 +84,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -109,13 +109,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -134,13 +134,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -159,13 +159,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -184,13 +184,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -209,13 +209,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -234,13 +234,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -259,13 +259,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -280,13 +280,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -301,13 +301,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -322,13 +322,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
