@@ -2410,19 +2410,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
24102410; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
24112411; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
24122412; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
2413- ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
24142413; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2415- ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2416- ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2417- ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2418- ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
24192414; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
24202415; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
24212416; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2417+ ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
24222418; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2419+ ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
24232420; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2421+ ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
24242422; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2423+ ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
24252424; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2425+ ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
24262426; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
24272427; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
24282428; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2457,19 +2457,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
24572457; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
24582458; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
24592459; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2460- ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
24612460; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2462- ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2463- ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2464- ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2465- ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
24662461; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
24672462; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
24682463; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2464+ ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
24692465; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2466+ ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
24702467; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2468+ ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
24712469; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2470+ ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
24722471; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2472+ ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
24732473; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
24742474; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
24752475; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2504,19 +2504,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
25042504; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
25052505; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
25062506; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
2507- ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
25082507; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2509- ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2510- ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2511- ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2512- ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
25132508; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
25142509; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1]
25152510; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2511+ ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
25162512; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2513+ ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
25172514; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2515+ ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
25182516; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2517+ ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
25192518; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2519+ ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
25202520; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
25212521; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
25222522; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2551,19 +2551,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
25512551; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
25522552; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
25532553; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2554- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
25552554; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2556- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2557- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2558- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2559- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
25602555; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
25612556; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
25622557; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2558+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
25632559; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2560+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
25642561; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2562+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
25652563; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2564+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
25662565; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2566+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
25672567; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
25682568; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
25692569; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
0 commit comments