@@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
106106; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
107107; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
108108; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
109- ; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1]
109+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
110110; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
111111; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
112112; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
@@ -135,7 +135,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
135135; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
136136; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
137137; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
138- ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1]
138+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
139139; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
140140; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
141141; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
@@ -164,7 +164,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
164164; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
165165; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
166166; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
167- ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1]
167+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
168168; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
169169; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
170170; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
@@ -193,7 +193,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
193193; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
194194; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
195195; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
196- ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1]
196+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
197197; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
198198; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
199199; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
@@ -364,14 +364,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
364364;
365365; AVX512-LABEL: load_i32_stride4_vf4:
366366; AVX512: # %bb.0:
367- ; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
367+ ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
368368; AVX512-NEXT: vmovaps (%rdi), %zmm1
369369; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0
370- ; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
370+ ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
371371; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2
372- ; AVX512-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
372+ ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
373373; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3
374- ; AVX512-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
374+ ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
375375; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1
376376; AVX512-NEXT: vmovaps %xmm0, (%rsi)
377377; AVX512-NEXT: vmovaps %xmm2, (%rdx)
@@ -382,14 +382,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
382382;
383383; AVX512-FCP-LABEL: load_i32_stride4_vf4:
384384; AVX512-FCP: # %bb.0:
385- ; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
385+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
386386; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
387387; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
388- ; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
388+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
389389; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
390- ; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
390+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
391391; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
392- ; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
392+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
393393; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
394394; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi)
395395; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
@@ -400,14 +400,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
400400;
401401; AVX512DQ-LABEL: load_i32_stride4_vf4:
402402; AVX512DQ: # %bb.0:
403- ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
403+ ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
404404; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1
405405; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0
406- ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
406+ ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
407407; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2
408- ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
408+ ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
409409; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3
410- ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
410+ ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
411411; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1
412412; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
413413; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx)
@@ -418,14 +418,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
418418;
419419; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4:
420420; AVX512DQ-FCP: # %bb.0:
421- ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
421+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
422422; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
423423; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
424- ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
424+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
425425; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
426- ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
426+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
427427; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
428- ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
428+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
429429; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
430430; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi)
431431; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
@@ -436,14 +436,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
436436;
437437; AVX512BW-LABEL: load_i32_stride4_vf4:
438438; AVX512BW: # %bb.0:
439- ; AVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
439+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
440440; AVX512BW-NEXT: vmovaps (%rdi), %zmm1
441441; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
442- ; AVX512BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
442+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
443443; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
444- ; AVX512BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
444+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
445445; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
446- ; AVX512BW-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
446+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
447447; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
448448; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
449449; AVX512BW-NEXT: vmovaps %xmm2, (%rdx)
@@ -454,14 +454,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
454454;
455455; AVX512BW-FCP-LABEL: load_i32_stride4_vf4:
456456; AVX512BW-FCP: # %bb.0:
457- ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
457+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
458458; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
459459; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
460- ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
460+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
461461; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
462- ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
462+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
463463; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
464- ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
464+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
465465; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
466466; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
467467; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
@@ -472,14 +472,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
472472;
473473; AVX512DQ-BW-LABEL: load_i32_stride4_vf4:
474474; AVX512DQ-BW: # %bb.0:
475- ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
475+ ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
476476; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1
477477; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
478- ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
478+ ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
479479; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
480- ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
480+ ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
481481; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
482- ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
482+ ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
483483; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
484484; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
485485; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx)
@@ -490,14 +490,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
490490;
491491; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4:
492492; AVX512DQ-BW-FCP: # %bb.0:
493- ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12]
493+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
494494; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
495495; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
496- ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13]
496+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
497497; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
498- ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14]
498+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
499499; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
500- ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15]
500+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
501501; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
502502; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
503503; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
0 commit comments