@@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31293129; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
31303130; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
31313131; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3132- ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
31333132; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3133+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
31343134; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
31353135; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
31363136; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3141,14 +3141,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31413141; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31423142; AVX2: # %bb.0:
31433143; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
3144- ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
31453144; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
3146- ; AVX2-NEXT: vinserti128 $1, % xmm1, %ymm0, %ymm0
3147- ; AVX2-NEXT: vpbroadcastw (%rdi) , %ymm1
3148- ; AVX2-NEXT: vpaddb (%rsi), %ymm0 , %ymm0
3149- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1 , %ymm1
3150- ; AVX2-NEXT: vmovdqa %ymm1 , 32(%rdx)
3151- ; AVX2-NEXT: vmovdqa %ymm0 , (%rdx)
3145+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
3146+ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1 , %ymm1
3147+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1 , %ymm1
3148+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0 , %ymm0
3149+ ; AVX2-NEXT: vmovdqa %ymm0 , 32(%rdx)
3150+ ; AVX2-NEXT: vmovdqa %ymm1 , (%rdx)
31523151; AVX2-NEXT: vzeroupper
31533152; AVX2-NEXT: retq
31543153;
@@ -3234,13 +3233,17 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
32343233; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
32353234; AVX: # %bb.0:
32363235; AVX-NEXT: vmovdqa (%rdi), %xmm0
3237- ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
3238- ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3239- ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
3236+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3237+ ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
3238+ ; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2
3239+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3240+ ; AVX-NEXT: vmovdqa (%rdi), %xmm3
3241+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
32403242; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
32413243; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3242- ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3243- ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3244+ ; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
3245+ ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
3246+ ; AVX-NEXT: vmovdqa %xmm2, (%rdx)
32443247; AVX-NEXT: retq
32453248;
32463249; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3516,16 +3519,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
35163519; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
35173520; SSE42: # %bb.0:
35183521; SSE42-NEXT: movdqa (%rdi), %xmm0
3519- ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1 ]
3520- ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0 [0,1,0,1 ]
3521- ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0 ]
3522- ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7 ]
3523- ; SSE42-NEXT: paddb (%rsi), %xmm0
3524- ; SSE42-NEXT: paddb 16(%rsi), %xmm2
3525- ; SSE42-NEXT: paddb 32(%rsi), %xmm1
3526- ; SSE42-NEXT: movdqa %xmm1 , 32(%rdx)
3527- ; SSE42-NEXT: movdqa %xmm2 , 16(%rdx)
3528- ; SSE42-NEXT: movdqa %xmm0 , (%rdx)
3522+ ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0 ]
3523+ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1 [0,1],mem[2,3,4,5],xmm1[6,7 ]
3524+ ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1 ]
3525+ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1 ]
3526+ ; SSE42-NEXT: paddb (%rsi), %xmm1
3527+ ; SSE42-NEXT: paddb 16(%rsi), %xmm0
3528+ ; SSE42-NEXT: paddb 32(%rsi), %xmm2
3529+ ; SSE42-NEXT: movdqa %xmm2 , 32(%rdx)
3530+ ; SSE42-NEXT: movdqa %xmm0 , 16(%rdx)
3531+ ; SSE42-NEXT: movdqa %xmm1 , (%rdx)
35293532; SSE42-NEXT: retq
35303533;
35313534; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -3534,8 +3537,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
35343537; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
35353538; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
35363539; AVX-NEXT: vbroadcastss (%rdi), %xmm2
3540+ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
35373541; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3538- ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
35393542; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
35403543; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
35413544; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3546,10 +3549,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
35463549; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
35473550; AVX2: # %bb.0:
35483551; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
3552+ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1]
35493553; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
3550- ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
3551- ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
3552- ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1
3554+ ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
3555+ ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
35533556; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
35543557; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
35553558; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3631,15 +3634,19 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
36313634;
36323635; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
36333636; AVX: # %bb.0:
3634- ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
3635- ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
3636- ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3637- ; AVX-NEXT: vmovdqa (%rdi), %xmm1
3638- ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
3639- ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3640- ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
3637+ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
3638+ ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
3639+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3640+ ; AVX-NEXT: vmovdqa (%rdi), %xmm2
3641+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
3642+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
3643+ ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3644+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
36413645; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3642- ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3646+ ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3647+ ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
3648+ ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3649+ ; AVX-NEXT: vzeroupper
36433650; AVX-NEXT: retq
36443651;
36453652; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3701,25 +3708,26 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
37013708define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2 (ptr %in.elt.ptr , ptr %out.vec.bias.ptr , ptr %out.vec.ptr ) nounwind {
37023709; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
37033710; SSE2: # %bb.0:
3704- ; SSE2-NEXT: movdqa (%rdi), %xmm0
3711+ ; SSE2-NEXT: movaps (%rdi), %xmm0
37053712; SSE2-NEXT: movaps 48(%rdi), %xmm1
3706- ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
37073713; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
3708- ; SSE2-NEXT: paddb 16(%rsi), %xmm2
3714+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3715+ ; SSE2-NEXT: paddb 16(%rsi), %xmm0
37093716; SSE2-NEXT: paddb (%rsi), %xmm1
37103717; SSE2-NEXT: movdqa %xmm1, (%rdx)
3711- ; SSE2-NEXT: movdqa %xmm2 , 16(%rdx)
3718+ ; SSE2-NEXT: movdqa %xmm0 , 16(%rdx)
37123719; SSE2-NEXT: retq
37133720;
37143721; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
37153722; SSE42: # %bb.0:
37163723; SSE42-NEXT: movdqa (%rdi), %xmm0
3717- ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
3718- ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
3719- ; SSE42-NEXT: paddb 16(%rsi), %xmm1
3720- ; SSE42-NEXT: paddb (%rsi), %xmm0
3721- ; SSE42-NEXT: movdqa %xmm0, (%rdx)
3722- ; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
3724+ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3725+ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
3726+ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3727+ ; SSE42-NEXT: paddb 16(%rsi), %xmm0
3728+ ; SSE42-NEXT: paddb (%rsi), %xmm1
3729+ ; SSE42-NEXT: movdqa %xmm1, (%rdx)
3730+ ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
37233731; SSE42-NEXT: retq
37243732;
37253733; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -3812,15 +3820,19 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
38123820;
38133821; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
38143822; AVX: # %bb.0:
3815- ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
3816- ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
3817- ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3818- ; AVX-NEXT: vmovdqa (%rdi), %xmm1
3819- ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
3820- ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3821- ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
3823+ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
3824+ ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
3825+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3826+ ; AVX-NEXT: vmovdqa (%rdi), %xmm2
3827+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
3828+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
3829+ ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3830+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
38223831; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3823- ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3832+ ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3833+ ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
3834+ ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3835+ ; AVX-NEXT: vzeroupper
38243836; AVX-NEXT: retq
38253837;
38263838; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
0 commit comments