@@ -84,13 +84,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
8484; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
8585; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
8686; AVX2-NEXT: vmovdqa (%rdi), %xmm0
87- ; AVX2-NEXT: vmovdqa (%rsi), %xmm1
88- ; AVX2-NEXT: vmovdqa (%rdx), %xmm2
87+ ; AVX2-NEXT: vmovdqa (%rdx), %xmm1
88+ ; AVX2-NEXT: vmovdqa (%r8), %xmm2
89+ ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
90+ ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
8991; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
90- ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
91- ; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
92- ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
93- ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
92+ ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
93+ ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
9494; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
9595; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
9696; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -109,13 +109,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
109109; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
110110; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
111111; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
112- ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
113- ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
112+ ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
113+ ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
114+ ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
115+ ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
114116; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
115- ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
116- ; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
117- ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
118- ; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
117+ ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
118+ ; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
119119; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
120120; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
121121; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -134,13 +134,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
134134; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
135135; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
136136; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
137- ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
138- ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
137+ ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
138+ ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
139+ ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
140+ ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
139141; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
140- ; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
141- ; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
142- ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
143- ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
142+ ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
143+ ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
144144; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
145145; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
146146; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -159,13 +159,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
159159; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
160160; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
161161; AVX512-NEXT: vmovdqa (%rdi), %xmm0
162- ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
163- ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
162+ ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
163+ ; AVX512-NEXT: vmovdqa (%r8), %xmm2
164+ ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
165+ ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
164166; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
165- ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
166- ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
167- ; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
168- ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
167+ ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
168+ ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
169169; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
170170; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
171171; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -184,13 +184,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
184184; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
185185; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
186186; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
187- ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
188- ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
187+ ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
188+ ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
189+ ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
190+ ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
189191; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
190- ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
191- ; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
192- ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
193- ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
192+ ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
193+ ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
194194; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
195195; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
196196; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -209,13 +209,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
209209; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
210210; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
211211; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
212- ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
213- ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
212+ ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
213+ ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
214+ ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
215+ ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
214216; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
215- ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
216- ; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
217- ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
218- ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
217+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
218+ ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
219219; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
220220; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
221221; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -234,13 +234,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
234234; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
235235; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
236236; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
237- ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
238- ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
237+ ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
238+ ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
239+ ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
240+ ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
239241; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
240- ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
241- ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
242- ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
243- ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
242+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
243+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
244244; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
245245; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
246246; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -259,13 +259,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
259259; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
260260; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
261261; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
262- ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
263- ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
262+ ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
263+ ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
264+ ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
265+ ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
264266; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
265- ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
266- ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
267- ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
268- ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
267+ ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
268+ ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
269269; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
270270; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
271271; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -280,13 +280,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
280280; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
281281; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
282282; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
283- ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
284- ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
283+ ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
284+ ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
285+ ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
286+ ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
285287; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
286- ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
287- ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
288- ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
289- ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
288+ ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
289+ ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
290290; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
291291; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
292292; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -301,13 +301,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
301301; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
302302; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
303303; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
304- ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
305- ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
304+ ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
305+ ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
306+ ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
307+ ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
306308; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
307- ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
308- ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
309- ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
310- ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
309+ ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
310+ ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
311311; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
312312; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
313313; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -322,13 +322,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
322322; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
323323; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
324324; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
325- ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
326- ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
325+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
326+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
327+ ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
328+ ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
327329; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
328- ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
329- ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
330- ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
331- ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
330+ ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
331+ ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
332332; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
333333; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
334334; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
0 commit comments