Merged
27 changes: 25 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9800,6 +9800,24 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
MaskSize == (int)ExpectedOp.getNumOperands())
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
break;
case ISD::BITCAST:
if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
Contributor

Do we have more than one bitcast adjacent? Can we use getOperand(0)?

Collaborator Author

Yes, it happens as DAG combines still don't occur in topological order - and it's common in shuffle lowering, where we are bitcasting between shuffle widths so much.
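For context, a minimal sketch (the helper name below is illustrative, not the verbatim upstream source) of why peekThroughBitcasts is preferred over a single getOperand(0) here: it walks through every adjacent ISD::BITCAST, so stacked bitcasts that haven't been combined away yet are all stripped before the element comparison.

static SDValue peekThroughBitcastsSketch(SDValue V) {
  // Strip all adjacent bitcasts; getOperand(0) would only remove one level.
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

Under that assumption, looking only at getOperand(0) could leave a second bitcast in place and miss the equivalence.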

SDValue Src = peekThroughBitcasts(Op);
EVT SrcVT = Src.getValueType();
if (SrcVT.isVector() &&
(SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
return (Idx % Scale) == (ExpectedIdx % Scale) &&
IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
Idx / Scale, ExpectedIdx / Scale);
}
}
break;
case ISD::VECTOR_SHUFFLE: {
auto *SVN = cast<ShuffleVectorSDNode>(Op);
return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
}
case X86ISD::VBROADCAST:
case X86ISD::VBROADCAST_LOAD:
// TODO: Handle MaskSize != VT.getVectorNumElements()?
@@ -12780,8 +12798,13 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,

// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
if (BroadcastIdx < 0) {
// Check for hidden broadcast.
SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
return SDValue();
BroadcastIdx = 0;
}
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
91 changes: 39 additions & 52 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Expand All @@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)