@@ -910,8 +910,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
910910; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
911911; AVX2: # %bb.0:
912912; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
913- ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
914913; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
914+ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
915915; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
916916; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
917917; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -920,32 +920,32 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
920920;
921921; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
922922; AVX512F: # %bb.0:
923- ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
924- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
925- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
926- ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
923+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
924+ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
925+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
926+ ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
927927; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
928928; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
929929; AVX512F-NEXT: vzeroupper
930930; AVX512F-NEXT: retq
931931;
932932; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
933933; AVX512DQ: # %bb.0:
934- ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
935- ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
936- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
937- ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
934+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
935+ ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
936+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
937+ ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
938938; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
939939; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
940940; AVX512DQ-NEXT: vzeroupper
941941; AVX512DQ-NEXT: retq
942942;
943943; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
944944; AVX512BW: # %bb.0:
945- ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
946- ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
947- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
948- ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
945+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0
946+ ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
947+ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
948+ ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
949949; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
950950; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
951951; AVX512BW-NEXT: vzeroupper
@@ -1906,12 +1906,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
19061906; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
19071907; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
19081908; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1909- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
1910- ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
1911- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
19121909; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1913- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
1910+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
19141911; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
1912+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
19151913; AVX512F-NEXT: vzeroupper
19161914; AVX512F-NEXT: retq
19171915;
@@ -1921,12 +1919,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
19211919; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
19221920; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
19231921; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1924- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
1925- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
1926- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
19271922; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1928- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
1923+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
19291924; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
1925+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
19301926; AVX512DQ-NEXT: vzeroupper
19311927; AVX512DQ-NEXT: retq
19321928;
@@ -1936,7 +1932,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
19361932; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
19371933; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
19381934; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1939- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
19401935; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
19411936; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
19421937; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2133,37 +2128,33 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
21332128; AVX512F: # %bb.0:
21342129; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
21352130; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2136- ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2137- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2138- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2139- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2140- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2141- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2142- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2131+ ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2132+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2133+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2134+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2135+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
21432136; AVX512F-NEXT: vzeroupper
21442137; AVX512F-NEXT: retq
21452138;
21462139; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
21472140; AVX512DQ: # %bb.0:
21482141; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
21492142; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2150- ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2151- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2152- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2153- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2154- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2155- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2156- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2143+ ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2144+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2145+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2146+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2147+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
21572148; AVX512DQ-NEXT: vzeroupper
21582149; AVX512DQ-NEXT: retq
21592150;
21602151; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
21612152; AVX512BW: # %bb.0:
21622153; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
2154+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
21632155; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
21642156; AVX512BW-NEXT: kmovd %eax, %k1
2165- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
2166- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2157+ ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
21672158; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
21682159; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
21692160; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2364,37 +2355,33 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
23642355; AVX512F: # %bb.0:
23652356; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
23662357; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2367- ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2368- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2369- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2370- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2371- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2372- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2373- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2358+ ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2359+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2360+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2361+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2362+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
23742363; AVX512F-NEXT: vzeroupper
23752364; AVX512F-NEXT: retq
23762365;
23772366; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
23782367; AVX512DQ: # %bb.0:
23792368; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
23802369; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2381- ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2382- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2383- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2384- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2385- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2386- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2387- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2370+ ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2371+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2372+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2373+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2374+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
23882375; AVX512DQ-NEXT: vzeroupper
23892376; AVX512DQ-NEXT: retq
23902377;
23912378; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
23922379; AVX512BW: # %bb.0:
23932380; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
2381+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
23942382; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
23952383; AVX512BW-NEXT: kmovd %eax, %k1
2396- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
2397- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2384+ ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
23982385; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
23992386; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
24002387; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2817,7 +2804,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
28172804; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
28182805; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
28192806; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2820- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
28212807; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
28222808; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
28232809; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2830,7 +2816,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
28302816; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
28312817; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
28322818; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2833- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
28342819; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
28352820; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
28362821; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2907,8 +2892,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
29072892; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
29082893; AVX2: # %bb.0:
29092894; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2910- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
2911- ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7]
2895+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
29122896; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
29132897; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
29142898; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2920,26 +2904,24 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
29202904; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
29212905; AVX512F: # %bb.0:
29222906; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
2923- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2924- ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
2925- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2926- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2927- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2928- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2929- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2907+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2908+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2909+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2910+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2911+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2912+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
29302913; AVX512F-NEXT: vzeroupper
29312914; AVX512F-NEXT: retq
29322915;
29332916; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
29342917; AVX512DQ: # %bb.0:
29352918; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
2936- ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2937- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
2938- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2939- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2940- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2941- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2942- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2919+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2920+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2921+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2922+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2923+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2924+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
29432925; AVX512DQ-NEXT: vzeroupper
29442926; AVX512DQ-NEXT: retq
29452927;
@@ -3024,7 +3006,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
30243006; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
30253007; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
30263008; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
3027- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
30283009; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
30293010; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
30303011; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3037,7 +3018,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
30373018; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
30383019; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
30393020; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
3040- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
30413021; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
30423022; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
30433023; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3117,8 +3097,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31173097; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31183098; AVX2: # %bb.0:
31193099; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
3120- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
3121- ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
3100+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
31223101; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
31233102; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
31243103; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -3130,9 +3109,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31303109; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31313110; AVX512F: # %bb.0:
31323111; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
3133- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3134- ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
3135- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3112+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3113+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
31363114; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
31373115; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
31383116; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
@@ -3144,9 +3122,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31443122; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31453123; AVX512DQ: # %bb.0:
31463124; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
3147- ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3148- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
3149- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3125+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3126+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
31503127; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
31513128; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
31523129; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
0 commit comments