Skip to content

Commit dc7004b

Browse files
committed
[SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 1)
As reported in #141034, SelectionDAG::getNode had some unexpected behaviors when trying to create vectors with UNDEF elements. Since we treat both UNDEF and POISON as undefined (when using isUndef()), we can't just fold away INSERT_VECTOR_ELT/INSERT_SUBVECTOR based on isUndef(), as that could make the resulting vector more poisonous. The same kind of bug existed in DAGCombiner::visitINSERT_SUBVECTOR. Here are some examples: This fold was done even if vec[idx] was POISON: INSERT_VECTOR_ELT vec, UNDEF, idx -> vec. This fold was done even if any of vec[idx..idx+size] was POISON: INSERT_SUBVECTOR vec, UNDEF, idx -> vec. This fold was done even if the elements not extracted from vec could be POISON: sub = EXTRACT_SUBVECTOR vec, idx; INSERT_SUBVECTOR UNDEF, sub, idx -> vec. With this patch we avoid such folds unless we can prove that the result isn't more poisonous when eliminating the insert. This patch in itself results in some regressions. The goal is to try to deal with those regressions in follow-up commits. Fixes #141034
1 parent a72bcda commit dc7004b

20 files changed

+1437
-928
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22955,6 +22955,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2295522955
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
2295622956

2295722957
// Insert into out-of-bounds element is undefined.
22958+
// The code below relies on this special case being handled early.
2295822959
if (IndexC && VT.isFixedLengthVector() &&
2295922960
IndexC->getZExtValue() >= VT.getVectorNumElements())
2296022961
return DAG.getUNDEF(VT);
@@ -22965,14 +22966,29 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2296522966
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
2296622967
return InVec;
2296722968

22968-
if (!IndexC) {
22969-
// If this is variable insert to undef vector, it might be better to splat:
22970-
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
22971-
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
22972-
return DAG.getSplat(VT, DL, InVal);
22973-
return SDValue();
22969+
// If this is variable insert to undef vector, it might be better to splat:
22970+
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
22971+
if (!IndexC && InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
22972+
return DAG.getSplat(VT, DL, InVal);
22973+
22974+
// Try to drop insert of UNDEF/POISON elements. This is also done in getNode,
22975+
// but we also do it as a DAG combine since for example simplifications into
22976+
// SPLAT_VECTOR/BUILD_VECTOR may turn poison elements into undef/zero etc, and
22977+
// then suddenly the InVec is guaranteed to not be poison.
22978+
if (InVal.isUndef()) {
22979+
if (IndexC && VT.isFixedLengthVector()) {
22980+
APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(),
22981+
IndexC->getZExtValue());
22982+
if (DAG.isGuaranteedNotToBePoison(InVec, EltMask))
22983+
return InVec;
22984+
} else if (DAG.isGuaranteedNotToBePoison(InVec)) {
22985+
return InVec;
22986+
}
2297422987
}
2297522988

22989+
if (!IndexC)
22990+
return SDValue();
22991+
2297622992
if (VT.isScalableVector())
2297722993
return SDValue();
2297822994

@@ -27405,18 +27421,40 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
2740527421
SDValue N2 = N->getOperand(2);
2740627422
uint64_t InsIdx = N->getConstantOperandVal(2);
2740727423

27408-
// If inserting an UNDEF, just return the original vector.
27409-
if (N1.isUndef())
27410-
return N0;
27424+
// If inserting an UNDEF, just return the original vector (unless it makes the
27425+
// result more poisonous).
27426+
if (N1.isUndef()){
27427+
if (VT.isFixedLengthVector()) {
27428+
unsigned SubVecNumElts = N1.getValueType().getVectorNumElements();
27429+
APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
27430+
InsIdx, InsIdx + SubVecNumElts);
27431+
if (DAG.isGuaranteedNotToBePoison(N0, EltMask))
27432+
return N0;
27433+
} else if (DAG.isGuaranteedNotToBePoison(N0))
27434+
return N0;
27435+
}
2741127436

27412-
// If this is an insert of an extracted vector into an undef vector, we can
27413-
// just use the input to the extract if the types match, and can simplify
27437+
// If this is an insert of an extracted vector into an undef/poison vector, we
27438+
// can just use the input to the extract if the types match, and can simplify
2741427439
// in some cases even if they don't.
2741527440
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
2741627441
N1.getOperand(1) == N2) {
27442+
EVT N1VT = N1.getValueType();
2741727443
EVT SrcVT = N1.getOperand(0).getValueType();
27418-
if (SrcVT == VT)
27419-
return N1.getOperand(0);
27444+
if (SrcVT == VT) {
27445+
// Need to ensure that result isn't more poisonous if skipping both the
27446+
// extract+insert.
27447+
if (N0.getOpcode() == ISD::POISON)
27448+
return N1.getOperand(0);
27449+
if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) {
27450+
unsigned SubVecNumElts = N1VT.getVectorNumElements();
27451+
APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
27452+
InsIdx, InsIdx + SubVecNumElts);
27453+
if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask))
27454+
return N1.getOperand(0);
27455+
} else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0)))
27456+
return N1.getOperand(0);
27457+
}
2742027458
// TODO: To remove the zero check, need to adjust the offset to
2742127459
// a multiple of the new src type.
2742227460
if (isNullConstant(N2)) {

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7911,23 +7911,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
79117911
// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
79127912
// for scalable vectors where we will generate appropriate code to
79137913
// deal with out-of-bounds cases correctly.
7914-
if (N3C && N1.getValueType().isFixedLengthVector() &&
7915-
N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
7914+
if (N3C && VT.isFixedLengthVector() &&
7915+
N3C->getZExtValue() >= VT.getVectorNumElements())
79167916
return getUNDEF(VT);
79177917

79187918
// Undefined index can be assumed out-of-bounds, so that's UNDEF too.
79197919
if (N3.isUndef())
79207920
return getUNDEF(VT);
79217921

7922-
// If the inserted element is an UNDEF, just use the input vector.
7923-
if (N2.isUndef())
7922+
// If inserting poison, just use the input vector.
7923+
if (N2.getOpcode() == ISD::POISON)
79247924
return N1;
79257925

7926+
// Inserting undef into undef/poison is still undef.
7927+
if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
7928+
return getUNDEF(VT);
7929+
7930+
// If the inserted element is an UNDEF, just use the input vector.
7931+
// But not if skipping the insert could make the result more poisonous.
7932+
if (N2.isUndef()) {
7933+
if (N3C && VT.isFixedLengthVector()) {
7934+
APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(),
7935+
N3C->getZExtValue());
7936+
if (isGuaranteedNotToBePoison(N1, EltMask))
7937+
return N1;
7938+
} else if (isGuaranteedNotToBePoison(N1))
7939+
return N1;
7940+
}
79267941
break;
79277942
}
79287943
case ISD::INSERT_SUBVECTOR: {
7929-
// Inserting undef into undef is still undef.
7930-
if (N1.isUndef() && N2.isUndef())
7944+
// If inserting poison, just use the input vector,
7945+
if (N2.getOpcode() == ISD::POISON)
7946+
return N1;
7947+
7948+
// Inserting undef into undef/poison is still undef.
7949+
if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
79317950
return getUNDEF(VT);
79327951

79337952
EVT N2VT = N2.getValueType();
@@ -7956,11 +7975,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
79567975
if (VT == N2VT)
79577976
return N2;
79587977

7959-
// If this is an insert of an extracted vector into an undef vector, we
7960-
// can just use the input to the extract.
7978+
// If this is an insert of an extracted vector into an undef/poison vector,
7979+
// we can just use the input to the extract. But not if skipping the
7980+
// extract+insert could make the result more poisonous.
79617981
if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7962-
N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
7963-
return N2.getOperand(0);
7982+
N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) {
7983+
if (N1.getOpcode() == ISD::POISON)
7984+
return N2.getOperand(0);
7985+
if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) {
7986+
unsigned LoBit = N3->getAsZExtVal();
7987+
unsigned HiBit = LoBit + N2VT.getVectorNumElements();
7988+
APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
7989+
LoBit, HiBit);
7990+
if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask))
7991+
return N2.getOperand(0);
7992+
} else if (isGuaranteedNotToBePoison(N2.getOperand(0)))
7993+
return N2.getOperand(0);
7994+
}
7995+
7996+
// If the inserted subvector is UNDEF, just use the input vector.
7997+
// But not if skipping the insert could make the result more poisonous.
7998+
if (N2.isUndef()) {
7999+
if (VT.isFixedLengthVector()) {
8000+
unsigned LoBit = N3->getAsZExtVal();
8001+
unsigned HiBit = LoBit + N2VT.getVectorNumElements();
8002+
APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
8003+
LoBit, HiBit);
8004+
if (isGuaranteedNotToBePoison(N1, EltMask))
8005+
return N1;
8006+
} else if (isGuaranteedNotToBePoison(N1))
8007+
return N1;
8008+
}
79648009
break;
79658010
}
79668011
case ISD::BITCAST:

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,10 @@ define void @insert_vec_v3i16_uaddlv_from_v8i16(ptr %0) {
229229
; CHECK: ; %bb.0: ; %entry
230230
; CHECK-NEXT: movi.2d v0, #0000000000000000
231231
; CHECK-NEXT: movi.2d v1, #0000000000000000
232-
; CHECK-NEXT: add x8, x0, #8
233232
; CHECK-NEXT: uaddlv.8h s0, v0
234233
; CHECK-NEXT: mov.h v1[0], v0[0]
234+
; CHECK-NEXT: mov.h v1[3], w8
235+
; CHECK-NEXT: add x8, x0, #8
235236
; CHECK-NEXT: ushll.4s v1, v1, #0
236237
; CHECK-NEXT: ucvtf.4s v1, v1
237238
; CHECK-NEXT: st1.s { v1 }[2], [x8]

llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) {
8484
; CHECK-NEXT: lsr w8, w0, #5
8585
; CHECK-NEXT: dup.16b v0, w8
8686
; CHECK-NEXT: mov.b v0[5], wzr
87+
; CHECK-NEXT: mov.b v0[8], w8
8788
; CHECK-NEXT: mov.b v0[9], wzr
8889
; CHECK-NEXT: ret
8990
%a1 = lshr exact i32 %a0, 5
@@ -144,6 +145,7 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values_skip8(i
144145
; CHECK-NEXT: mov.b v0[2], w8
145146
; CHECK-NEXT: mov.b v0[5], wzr
146147
; CHECK-NEXT: mov.b v0[7], w8
148+
; CHECK-NEXT: mov.b v0[8], w8
147149
; CHECK-NEXT: mov.b v0[9], wzr
148150
; CHECK-NEXT: mov.b v0[12], w8
149151
; CHECK-NEXT: mov.b v0[15], w8

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
3737
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
3838
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
3939
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
40+
; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
41+
; CHECK-NEXT: ptrue p1.h
42+
; CHECK-NEXT: and z2.h, z2.h, #0x1
43+
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
4044
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
4145
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
4246
; CHECK-NEXT: ret
@@ -59,8 +63,15 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
5963
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
6064
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
6165
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h
62-
; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
63-
; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
66+
; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
67+
; VBITS_GE_256-NEXT: ptrue p1.h
68+
; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff
69+
; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
70+
; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1
71+
; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0
72+
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0
73+
; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h
74+
; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h
6475
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
6576
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
6677
; VBITS_GE_256-NEXT: ret
@@ -71,6 +82,10 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
7182
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
7283
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
7384
; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
85+
; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
86+
; VBITS_GE_512-NEXT: ptrue p1.h
87+
; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
88+
; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
7489
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
7590
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
7691
; VBITS_GE_512-NEXT: ret
@@ -89,6 +104,10 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
89104
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
90105
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
91106
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
107+
; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
108+
; CHECK-NEXT: ptrue p1.h
109+
; CHECK-NEXT: and z2.h, z2.h, #0x1
110+
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
92111
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
93112
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
94113
; CHECK-NEXT: ret
@@ -107,6 +126,10 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
107126
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
108127
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
109128
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
129+
; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
130+
; CHECK-NEXT: ptrue p1.h
131+
; CHECK-NEXT: and z2.h, z2.h, #0x1
132+
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
110133
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
111134
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
112135
; CHECK-NEXT: ret
@@ -150,6 +173,10 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
150173
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
151174
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
152175
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
176+
; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
177+
; CHECK-NEXT: ptrue p1.s
178+
; CHECK-NEXT: and z2.s, z2.s, #0x1
179+
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
153180
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
154181
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
155182
; CHECK-NEXT: ret
@@ -172,8 +199,15 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
172199
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
173200
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
174201
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s
175-
; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
176-
; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
202+
; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
203+
; VBITS_GE_256-NEXT: ptrue p1.s
204+
; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff
205+
; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
206+
; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1
207+
; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0
208+
; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0
209+
; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s
210+
; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s
177211
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
178212
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
179213
; VBITS_GE_256-NEXT: ret
@@ -184,6 +218,10 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
184218
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
185219
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
186220
; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
221+
; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
222+
; VBITS_GE_512-NEXT: ptrue p1.s
223+
; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
224+
; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
187225
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
188226
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
189227
; VBITS_GE_512-NEXT: ret
@@ -202,6 +240,10 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
202240
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
203241
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
204242
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
243+
; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
244+
; CHECK-NEXT: ptrue p1.s
245+
; CHECK-NEXT: and z2.s, z2.s, #0x1
246+
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
205247
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
206248
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
207249
; CHECK-NEXT: ret
@@ -220,6 +262,10 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
220262
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
221263
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
222264
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
265+
; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
266+
; CHECK-NEXT: ptrue p1.s
267+
; CHECK-NEXT: and z2.s, z2.s, #0x1
268+
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
223269
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
224270
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
225271
; CHECK-NEXT: ret
@@ -264,6 +310,10 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
264310
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
265311
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
266312
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
313+
; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
314+
; CHECK-NEXT: ptrue p1.d
315+
; CHECK-NEXT: and z2.d, z2.d, #0x1
316+
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
267317
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
268318
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
269319
; CHECK-NEXT: ret
@@ -286,8 +336,15 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
286336
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
287337
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
288338
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d
289-
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d
290-
; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d
339+
; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
340+
; VBITS_GE_256-NEXT: ptrue p1.d
341+
; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff
342+
; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
343+
; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1
344+
; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
345+
; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0
346+
; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d
347+
; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d
291348
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
292349
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
293350
; VBITS_GE_256-NEXT: ret
@@ -298,6 +355,10 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
298355
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
299356
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
300357
; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
358+
; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
359+
; VBITS_GE_512-NEXT: ptrue p1.d
360+
; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
361+
; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
301362
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
302363
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
303364
; VBITS_GE_512-NEXT: ret
@@ -316,6 +377,10 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
316377
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
317378
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
318379
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
380+
; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
381+
; CHECK-NEXT: ptrue p1.d
382+
; CHECK-NEXT: and z2.d, z2.d, #0x1
383+
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
319384
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
320385
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
321386
; CHECK-NEXT: ret
@@ -334,6 +399,10 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
334399
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
335400
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
336401
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
402+
; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
403+
; CHECK-NEXT: ptrue p1.d
404+
; CHECK-NEXT: and z2.d, z2.d, #0x1
405+
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
337406
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
338407
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
339408
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)