59 changes: 43 additions & 16 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1139,24 +1139,51 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
if (EleVT == MVT::i1)
break;

assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
"We can't replace VSELECT with BLENDV in vXi16!");
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT CondVT = Cond.getValueType();
EVT EleVT = CondVT.getVectorElementType();
SDValue R;
if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
EleVT.getSizeInBits()) {
R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1), N->getOperand(2),
CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));

if (EleVT == MVT::i1) {
assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
break;
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
// vselect cond, op1, op2 = vselect not(cond), op2, op1
if (Cond.getOpcode() == ISD::SETCC &&
!ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
Cond.getOperand(1), CC);
} else if (Cond.getOpcode() == X86ISD::CMPM &&
Cond.getConstantOperandVal(2) == 0) {
// FLIP FCMP EQ -> (U)NE
R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
Cond.getOperand(0), Cond.getOperand(1),
CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
} else {
R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
}
R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
} else {
R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1),
N->getOperand(2));
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
"We can't replace VSELECT with BLENDV in vXi16!");
if (Subtarget->hasVLX() &&
CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
R = CurDAG->getNode(
X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
} else {
R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
Cond, LHS, RHS);
}
}
--I;
CurDAG->ReplaceAllUsesWith(N, R.getNode());
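
The vXi1 path added above relies on the identity vselect(cond, op1, op2) == vselect(not(cond), op2, op1). As a minimal hand-written sketch (not taken from the patch's test files, and assuming an AVX-512 target), this is the kind of IR it improves:

; Minimal sketch (hand-written; not one of the patch's tests), assuming an
; AVX-512 target: zeros sit in the true arm of a masked select.
define <16 x i32> @zero_mask_swap(<16 x i32> %a, <16 x i32> %b, <16 x i32> %x) {
  %c = icmp ult <16 x i32> %a, %b
  %r = select <16 x i1> %c, <16 x i32> zeroinitializer, <16 x i32> %x
  ; With the preprocessing above, the compare is inverted (ult -> uge, i.e.
  ; vpcmpnltud) and the arms are swapped, so %r can lower to a single
  ; zero-masked move, e.g. vmovdqa32 %zmm, %zmm {%k1} {z}, instead of a
  ; compare plus a separate zeroing blend.
  ret <16 x i32> %r
}
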
32 changes: 18 additions & 14 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5415,6 +5415,20 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}

// Match not(insert_subvector(undef, setcc(), c))
// --> insert_subvector(undef, not(setcc()), c)
if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
V.getOperand(1).getOpcode() == ISD::SETCC &&
V.getValueType().getScalarType() == MVT::i1) {
SDValue Cond = V.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), CC);
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
V.getOperand(0), NotSub, V.getOperand(2));
}

// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48049,19 +48063,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}

// Check if the first operand is all zeros and Cond type is vXi1.
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}

// Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
// get split by legalization.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
@@ -48125,11 +48126,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;

// select(~Cond, X, Y) -> select(Cond, Y, X)
if (CondVT.getScalarType() != MVT::i1) {
if (CondVT.getScalarType() != MVT::i1 ||
(ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())))
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);

if (CondVT.getScalarType() != MVT::i1) {
// select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
if (Cond.getOpcode() == X86ISD::PCMPEQ &&
Cond.getOperand(0).getOpcode() == ISD::AND &&
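
The IsNOT extension above pushes a NOT through insert_subvector(undef, setcc(), c) by inverting the condition code of the inner compare. A hand-written sketch (not one of the patch's tests; assuming AVX512F without VLX, where a <4 x i1> compare result gets widened into the mask register during legalization) of a shape that can reach this pattern:

; Hand-written sketch, assuming +avx512f without +avx512vl so the narrow
; compare result is widened as insert_subvector(undef, setcc, 0).
define <4 x i32> @not_of_widened_mask(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x) "target-features"="+avx512f" {
  %cmp = icmp ult <4 x i32> %a, %b
  ; not(mask): previously this could survive as a knot on the widened mask.
  %not = xor <4 x i1> %cmp, <i1 true, i1 true, i1 true, i1 true>
  ; IsNOT can now look through the widening and invert the setcc (ult -> uge)
  ; instead of materializing the xor on the mask register.
  %r = select <4 x i1> %not, <4 x i32> %x, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}
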
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,7 +5,8 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
; CHECK-NEXT: knotb %k0, %k1
Contributor
Seems we should take care of scalar select too?

Collaborator Author
Do we have a helper to invert FCMP condition codes? We have one for swapping/commutation, but I can't find anything for NOT(FCMP(X,Y,I)) -> FCMP(X,Y,NOT(I)).

Contributor
Doesn't getSetCCInverseImpl work for FCMP?

Collaborator Author
No, different encodings for CondCode

@topperc IIRC you did some work to clean this up?

Collaborator Author
I'm going to tentatively just flip using "CC ^ 4", but I'm not certain whether all the inf/NaN handling will be preserved?

Contributor
"CC ^ 4" looks correct to me.

; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/psubus.ll
@@ -981,9 +981,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpmovdb %zmm1, %xmm3
; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
; AVX512-NEXT: vpmovdb %zmm1, %xmm1
; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: vpsubb %xmm0, %xmm3, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
128 changes: 66 additions & 62 deletions llvm/test/CodeGen/X86/var-permute-128.ll
@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices