@@ -211,10 +211,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
211211; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
212212; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
213213; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
214- ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
215- ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
216- ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
214+ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
215+ ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
217216; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
217+ ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
218218; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
219219; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
220220; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
@@ -228,10 +228,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
228228; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
229229; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
230230; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
231- ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
232- ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
233- ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
231+ ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
232+ ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
234233; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
234+ ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
235235; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
236236; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
237237; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -245,10 +245,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
245245; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
246246; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
247247; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
248- ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
249- ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
250- ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
248+ ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
249+ ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
251250; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
251+ ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
252252; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
253253; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
254254; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -262,10 +262,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
262262; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
263263; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
264264; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
265- ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
266- ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
267- ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
265+ ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
266+ ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
268267; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
268+ ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
269269; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
270270; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
271271; AVX512-NEXT: vmovq %xmm1, 16(%rcx)
@@ -279,10 +279,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
279279; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
280280; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
281281; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
282- ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
283- ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
284- ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
282+ ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
283+ ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
285284; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
285+ ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
286286; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
287287; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
288288; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -296,10 +296,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
296296; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
297297; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
298298; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
299- ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
300- ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
301- ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
299+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
300+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
302301; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
302+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
303303; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
304304; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
305305; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx)
@@ -313,10 +313,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
313313; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
314314; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
315315; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
316- ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
317- ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
318- ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
316+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
317+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
319318; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
319+ ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
320320; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
321321; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
322322; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -330,12 +330,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
330330; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
331331; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
332332; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
333- ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
334- ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
335- ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
336- ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
337- ; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
338- ; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx)
333+ ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
334+ ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
335+ ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
336+ ; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx)
337+ ; AVX512BW-NEXT: vmovdqa %xmm2, (%rcx)
339338; AVX512BW-NEXT: vzeroupper
340339; AVX512BW-NEXT: retq
341340;
@@ -345,12 +344,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
345344; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
346345; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
347346; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
348- ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
349- ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
350- ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
351- ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
352- ; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
353- ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
347+ ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
348+ ; AVX512BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
349+ ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
350+ ; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
351+ ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
354352; AVX512BW-FCP-NEXT: vzeroupper
355353; AVX512BW-FCP-NEXT: retq
356354;
@@ -360,12 +358,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
360358; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
361359; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
362360; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
363- ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
364- ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
365- ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
366- ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
367- ; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx)
368- ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rcx)
361+ ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
362+ ; AVX512DQ-BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
363+ ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
364+ ; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx)
365+ ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rcx)
369366; AVX512DQ-BW-NEXT: vzeroupper
370367; AVX512DQ-BW-NEXT: retq
371368;
@@ -375,12 +372,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
375372; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
376373; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
377374; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
378- ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
379- ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
380- ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
381- ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
382- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
383- ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
375+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
376+ ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
377+ ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
378+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
379+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
384380; AVX512DQ-BW-FCP-NEXT: vzeroupper
385381; AVX512DQ-BW-FCP-NEXT: retq
386382 %in.vec0 = load <4 x i16 >, ptr %in.vecptr0 , align 64
0 commit comments