@@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: movq %xmm1, (%rdx)
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm1, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: movq %xmm0, (%rcx)
 ; SSE-NEXT: retq
 ;
@@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
 ; AVX-NEXT: vmovq %xmm0, (%rcx)
 ; AVX-NEXT: retq
 ;
@@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-NEXT: retq
 ;
@@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FP-NEXT: retq
 ;
@@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: retq
 ;
@@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-NEXT: retq
 ;
@@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: retq
 ;
@@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-NEXT: retq
 ;
@@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <12 x i16>, ptr %in.vec, align 64