@@ -911,8 +911,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
911911; AVX2: # %bb.0:
912912; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
913913; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
914- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
915- ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
914+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
916915; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
917916; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
918917; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1009,8 +1008,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
10091008;
10101009; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
10111010; AVX2: # %bb.0:
1012- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
1013- ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
1011+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
10141012; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
10151013; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
10161014; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -1106,8 +1104,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
11061104;
11071105; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
11081106; AVX2: # %bb.0:
1109- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
1110- ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
1107+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
11111108; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
11121109; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
11131110; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -1304,8 +1301,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
13041301;
13051302; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
13061303; AVX2: # %bb.0:
1307- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
1308- ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
1304+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
13091305; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15]
13101306; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
13111307; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1392,8 +1388,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
13921388;
13931389; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
13941390; AVX2: # %bb.0:
1395- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
1396- ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
1391+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
13971392; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
13981393; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
13991394; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2904,8 +2899,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
29042899; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
29052900; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
29062901; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2907- ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
2908- ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
2902+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
29092903; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
29102904; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
29112905; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
@@ -3174,49 +3168,45 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
31743168; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
31753169; AVX2-SLOW: # %bb.0:
31763170; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
3177- ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1
3171+ ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1
3172+ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
3173+ ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
31783174; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
31793175; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
3180- ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
3181- ; AVX2-SLOW-NEXT: vpbroadcastw %xmm2, %ymm2
3182- ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15]
3183- ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3184- ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
31853176; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3186- ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
3177+ ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
31873178; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
3179+ ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
31883180; AVX2-SLOW-NEXT: vzeroupper
31893181; AVX2-SLOW-NEXT: retq
31903182;
31913183; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
31923184; AVX2-FAST-PERLANE: # %bb.0:
31933185; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
31943186; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1
3195- ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
3196- ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
3197- ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm2, %ymm2
3198- ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3199- ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3200- ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3201- ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3202- ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx)
3203- ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx)
3187+ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
3188+ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm0
3189+ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3190+ ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3191+ ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3192+ ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1
3193+ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx)
3194+ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx)
32043195; AVX2-FAST-PERLANE-NEXT: vzeroupper
32053196; AVX2-FAST-PERLANE-NEXT: retq
32063197;
32073198; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
32083199; AVX2-FAST: # %bb.0:
32093200; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
32103201; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
3211- ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
3212- ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
3213- ; AVX2-FAST-NEXT: vpbroadcastw %xmm2, %ymm2
3214- ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3215- ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3216- ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3217- ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3218- ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
3219- ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx)
3202+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
3203+ ; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm0
3204+ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3205+ ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3206+ ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3207+ ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1
3208+ ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx)
3209+ ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx)
32203210; AVX2-FAST-NEXT: vzeroupper
32213211; AVX2-FAST-NEXT: retq
32223212;
0 commit comments