@@ -1888,14 +1888,15 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
18881888;
18891889; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
18901890; AVX2: # %bb.0:
1891- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
1892- ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1893- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
1894- ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1895- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1896- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1897- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
1898- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
1891+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1892+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
1893+ ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1894+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
1895+ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
1896+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
1897+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1898+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
1899+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
18991900; AVX2-NEXT: vzeroupper
19001901; AVX2-NEXT: retq
19011902;
@@ -2111,14 +2112,15 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
21112112;
21122113; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
21132114; AVX2: # %bb.0:
2114- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2115- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2116- ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2117- ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2118- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2119- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2120- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2121- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2115+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2116+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2117+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2118+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2119+ ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2120+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2121+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2122+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2123+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
21222124; AVX2-NEXT: vzeroupper
21232125; AVX2-NEXT: retq
21242126;
@@ -2235,29 +2237,33 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
22352237;
22362238; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
22372239; AVX512F: # %bb.0:
2238- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2239- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2240- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2241- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2242- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2243- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2244- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2245- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2246- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2240+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2241+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2242+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2243+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2244+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2245+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2246+ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2247+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2248+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2249+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2250+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
22472251; AVX512F-NEXT: vzeroupper
22482252; AVX512F-NEXT: retq
22492253;
22502254; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
22512255; AVX512DQ: # %bb.0:
2252- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2253- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2254- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2255- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2256- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2257- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2258- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2259- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2260- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2256+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2257+ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2258+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2259+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2260+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2261+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2262+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2263+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2264+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2265+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2266+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
22612267; AVX512DQ-NEXT: vzeroupper
22622268; AVX512DQ-NEXT: retq
22632269;
@@ -2266,8 +2272,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
22662272; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
22672273; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
22682274; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2269- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2275+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
22702276; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2277+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
22712278; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
22722279; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
22732280; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2332,14 +2339,15 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
23322339;
23332340; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
23342341; AVX2: # %bb.0:
2335- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2336- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2337- ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2338- ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2339- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2340- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2341- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2342- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2342+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2343+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2344+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2345+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2346+ ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2347+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2348+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2349+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2350+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
23432351; AVX2-NEXT: vzeroupper
23442352; AVX2-NEXT: retq
23452353;
@@ -2454,29 +2462,33 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
24542462;
24552463; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
24562464; AVX512F: # %bb.0:
2457- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2458- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2459- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2460- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2461- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2462- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2463- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2464- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2465- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2465+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2466+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2467+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2468+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2469+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2470+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2471+ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2472+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2473+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2474+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2475+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
24662476; AVX512F-NEXT: vzeroupper
24672477; AVX512F-NEXT: retq
24682478;
24692479; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
24702480; AVX512DQ: # %bb.0:
2471- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2472- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2473- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2474- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2475- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2476- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2477- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2478- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2479- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2481+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2482+ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2483+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2484+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2485+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2486+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2487+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2488+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2489+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2490+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2491+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
24802492; AVX512DQ-NEXT: vzeroupper
24812493; AVX512DQ-NEXT: retq
24822494;
@@ -2485,8 +2497,9 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
24852497; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
24862498; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
24872499; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2488- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2500+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
24892501; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2502+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
24902503; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
24912504; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
24922505; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2775,13 +2788,14 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
27752788;
27762789; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
27772790; AVX2: # %bb.0:
2778- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2779- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2780- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2781- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2782- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2783- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2784- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2791+ ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2792+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2793+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2794+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2795+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2796+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2797+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2798+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
27852799; AVX2-NEXT: vzeroupper
27862800; AVX2-NEXT: retq
27872801;
@@ -2976,13 +2990,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
29762990;
29772991; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
29782992; AVX2: # %bb.0:
2979- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2980- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2981- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
2982- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2983- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2984- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2985- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2993+ ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2994+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2995+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2996+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
2997+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2998+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2999+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3000+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
29863001; AVX2-NEXT: vzeroupper
29873002; AVX2-NEXT: retq
29883003;
@@ -3093,25 +3108,27 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
30933108;
30943109; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
30953110; AVX512F: # %bb.0:
3096- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
3111+ ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
30973112; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3098- ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3099- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3100- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3101- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
3102- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
3113+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3114+ ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3115+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3116+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3117+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
3118+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
31033119; AVX512F-NEXT: vzeroupper
31043120; AVX512F-NEXT: retq
31053121;
31063122; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31073123; AVX512DQ: # %bb.0:
3108- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
3124+ ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
31093125; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3110- ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3111- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3112- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3113- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
3114- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
3126+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3127+ ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3128+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3129+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3130+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
3131+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
31153132; AVX512DQ-NEXT: vzeroupper
31163133; AVX512DQ-NEXT: retq
31173134;
0 commit comments