Skip to content

Commit 95cd1b0

Browse files
committed
Work to fix regressions in integer select srcmod generation when v2i32
is made legal for or/xor/and. Complete fix of v2i32 in VOP SrcMod placement.
1 parent 3f452a1 commit 95cd1b0

File tree

4 files changed

+219
-159
lines changed

4 files changed

+219
-159
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3059,36 +3059,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
30593059
Src = Src.getOperand(0);
30603060
}
30613061

3062+
// v2i32 xor/or/and are legal. A vselect using these instructions as operands
3063+
// is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3064+
// through this extract if possible.
3065+
auto getVectorBitWiseOp = [](SDValue S) -> SDValue {
3066+
if (S->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
3067+
SDValue VecOp = S->getOperand(0);
3068+
if (VecOp.getOpcode() == ISD::XOR || VecOp.getOpcode() == ISD::AND ||
3069+
VecOp.getOpcode() == ISD::OR)
3070+
return VecOp;
3071+
}
3072+
return SDValue();
3073+
};
3074+
3075+
SDValue Vec = getVectorBitWiseOp(Src);
3076+
SDValue BWSrc = Vec ? Vec : Src;
30623077
// Convert various sign-bit masks to src mods. Currently disabled for 16-bit
30633078
// types as the codegen replaces the operand without adding a srcmod.
30643079
// This is intentionally finding the cases where we are performing float neg
30653080
// and abs on int types, the goal is not to obtain two's complement neg or
30663081
// abs.
30673082
// TODO: Add 16-bit support.
3068-
unsigned Opc = Src->getOpcode();
3083+
unsigned Opc = Vec ? Vec->getOpcode() : Src->getOpcode();
30693084
EVT VT = Src.getValueType();
30703085
if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
30713086
(VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
30723087
return true;
30733088

3074-
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
3089+
ConstantSDNode *CRHS =
3090+
isConstOrConstSplat(Vec ? Vec->getOperand(1) : Src->getOperand(1));
30753091
if (!CRHS)
30763092
return true;
30773093

3094+
auto ReplaceSrc = [&]() -> SDValue {
3095+
if (Vec) {
3096+
SDValue LHS = BWSrc->getOperand(0);
3097+
SDValue Index = Src->getOperand(1);
3098+
return Src = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3099+
Src.getValueType(), LHS, Index);
3100+
}
3101+
return Src = BWSrc.getOperand(0);
3102+
};
3103+
30783104
// Recognise (xor a, 0x80000000) as NEG SrcMod.
30793105
// Recognise (and a, 0x7fffffff) as ABS SrcMod.
30803106
// Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
30813107
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
30823108
Mods |= SISrcMods::NEG;
3083-
Src = Src.getOperand(0);
3109+
Src = ReplaceSrc();
30843110
} else if (Opc == ISD::AND && AllowAbs &&
30853111
CRHS->getAPIntValue().isMaxSignedValue()) {
30863112
Mods |= SISrcMods::ABS;
3087-
Src = Src.getOperand(0);
3113+
Src = ReplaceSrc();
30883114
} else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
30893115
Mods |= SISrcMods::ABS;
30903116
Mods |= SISrcMods::NEG;
3091-
Src = Src.getOperand(0);
3117+
Src = ReplaceSrc();
30923118
}
30933119

30943120
return true;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 162 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -13361,43 +13361,31 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1336113361
SDValue LHS = N->getOperand(0);
1336213362
SDValue RHS = N->getOperand(1);
1336313363

13364-
// Fold the fneg of a vselect into the v2 vselect operands.
13365-
// xor (vselect c, a, b), 0x80000000 ->
13366-
// bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b)))
13367-
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
13368-
13369-
const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
13370-
const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13371-
SDValue LHS_0 = LHS.getOperand(0);
13372-
SDValue LHS_1 = LHS.getOperand(1);
13373-
13374-
if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
13375-
CRHS0->getAPIntValue().isSignMask() &&
13376-
shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
13377-
CRHS1->getAPIntValue().isSignMask() &&
13378-
shouldFoldFNegIntoSrc(N, LHS_1)) {
13379-
13380-
SDLoc DL(N);
13381-
SDValue CastLHS =
13382-
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
13383-
SDValue CastRHS =
13384-
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
13385-
SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
13386-
SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
13387-
SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
13388-
LHS->getOperand(0), FNegLHS, FNegRHS);
13389-
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13390-
}
13391-
}
13392-
13393-
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13364+
const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
1339413365

1339513366
if (CRHS && VT == MVT::i64) {
1339613367
if (SDValue Split =
1339713368
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
1339813369
return Split;
1339913370
}
1340013371

13372+
// v2i32 (xor (vselect cc, x, y), K) ->
13373+
// (v2i32 vselect cc, (xor x, K), (xor y, K)) This enables the xor to be
13374+
// replaced with source modifiers when the select is lowered to CNDMASK.
13375+
// TODO REMOVE: prevents regressions in fneg-modifier-casting.ll
13376+
unsigned Opc = LHS.getOpcode();
13377+
if(((Opc == ISD::VSELECT && VT==MVT::v2i32) || (Opc == ISD::SELECT && VT==MVT::i64)) && CRHS && CRHS->getAPIntValue().isSignMask()) {
13378+
SDValue CC = LHS->getOperand(0);
13379+
SDValue TRUE = LHS->getOperand(1);
13380+
SDValue FALSE = LHS->getOperand(2);
13381+
SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
13382+
SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
13383+
SDValue XSelect = DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
13384+
return XSelect;
13385+
}
13386+
13387+
13388+
1340113389
// Make sure to apply the 64-bit constant splitting fold before trying to fold
1340213390
// fneg-like xors into 64-bit select.
1340313391
if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
@@ -14362,125 +14350,165 @@ bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
1436214350
EltSize, NumElem, Idx->isDivergent(), getSubtarget());
1436314351
}
1436414352

14365-
SDValue
14366-
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14367-
DAGCombinerInfo &DCI) const {
14368-
SDValue Vec = N->getOperand(0);
14369-
SelectionDAG &DAG = DCI.DAG;
14353+
// SDValue
14354+
// SITargetLowering::performBuildVectorCombine(SDNode *N,
14355+
// DAGCombinerInfo &DCI) const {
14356+
// // if (N->use_empty())
14357+
// // return SDValue();
1437014358

14371-
EVT VecVT = Vec.getValueType();
14372-
EVT VecEltVT = VecVT.getVectorElementType();
14373-
EVT ResVT = N->getValueType(0);
14359+
// // if(!N->getValueType(0).isFloatingPoint())
14360+
// // return SDValue();
1437414361

14375-
unsigned VecSize = VecVT.getSizeInBits();
14376-
unsigned VecEltSize = VecEltVT.getSizeInBits();
14362+
// // SelectionDAG &DAG = DCI.DAG;
1437714363

14378-
if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14379-
allUsesHaveSourceMods(N)) {
14380-
SDLoc SL(N);
14381-
SDValue Idx = N->getOperand(1);
14382-
SDValue Elt =
14383-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14384-
return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14385-
}
14386-
14387-
// ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14388-
// =>
14389-
// Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14390-
// Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14391-
// ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14392-
if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14393-
SDLoc SL(N);
14394-
SDValue Idx = N->getOperand(1);
14395-
unsigned Opc = Vec.getOpcode();
14364+
// // // Iterate the operands. Check if source modifier. If so, propagate the
14365+
// // source
14366+
// // // modifier to the user and the srcmod from the BUILD_VECTOR element.
14367+
// // for (unsigned I = 0; I < N->getNumOperands(); I++) {
14368+
// // SDValue E = N->getOperand(I);
14369+
// // if (E->getOpcode() != ISD::FNEG && E->getOpcode() != ISD::ABS)
14370+
// // continue;
1439614371

14397-
switch (Opc) {
14398-
default:
14399-
break;
14400-
// TODO: Support other binary operations.
14401-
case ISD::FADD:
14402-
case ISD::FSUB:
14403-
case ISD::FMUL:
14404-
case ISD::ADD:
14405-
case ISD::UMIN:
14406-
case ISD::UMAX:
14407-
case ISD::SMIN:
14408-
case ISD::SMAX:
14409-
case ISD::FMAXNUM:
14410-
case ISD::FMINNUM:
14411-
case ISD::FMAXNUM_IEEE:
14412-
case ISD::FMINNUM_IEEE:
14413-
case ISD::FMAXIMUM:
14414-
case ISD::FMINIMUM: {
14415-
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14416-
Vec.getOperand(0), Idx);
14417-
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14418-
Vec.getOperand(1), Idx);
14419-
14420-
DCI.AddToWorklist(Elt0.getNode());
14421-
DCI.AddToWorklist(Elt1.getNode());
14422-
return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14423-
}
14424-
}
14425-
}
14426-
14427-
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14428-
if (shouldExpandVectorDynExt(N)) {
14429-
SDLoc SL(N);
14430-
SDValue Idx = N->getOperand(1);
14431-
SDValue V;
14432-
for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14433-
SDValue IC = DAG.getVectorIdxConstant(I, SL);
14434-
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14435-
if (I == 0)
14436-
V = Elt;
14437-
else
14438-
V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14372+
// // // Users through which we can propagate will include users of
14373+
// // // extract_element on this vector, so need to peek-through.
14374+
// // }
14375+
14376+
// // SmallVector<SDNode*, 4> UsersToModify;
14377+
14378+
// // // If the use of the BUILD_VECTOR supports source mods it can be
14379+
// // propagated. for (SDNode *U : N->users()) {
14380+
// // if(!U->getOpcode() == ISD::EXTRACT_VECTOR_ELT)
14381+
// // if (!allUsesHaveSourceMods(U))
14382+
// // continue;
14383+
// // UsersToModify.push_back(U);
14384+
// // }
14385+
14386+
// // for(auto Node: UsersToModify) {
14387+
14388+
// // }
14389+
14390+
// return SDValue();
14391+
// }
14392+
14393+
SDValue SITargetLowering::performExtractVectorEltCombine(
14394+
SDNode * N, DAGCombinerInfo & DCI) const {
14395+
SDValue Vec = N->getOperand(0);
14396+
SelectionDAG &DAG = DCI.DAG;
14397+
14398+
EVT VecVT = Vec.getValueType();
14399+
EVT VecEltVT = VecVT.getVectorElementType();
14400+
EVT ResVT = N->getValueType(0);
14401+
14402+
unsigned VecSize = VecVT.getSizeInBits();
14403+
unsigned VecEltSize = VecEltVT.getSizeInBits();
14404+
14405+
if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14406+
allUsesHaveSourceMods(N)) {
14407+
SDLoc SL(N);
14408+
SDValue Idx = N->getOperand(1);
14409+
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14410+
Vec.getOperand(0), Idx);
14411+
return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14412+
}
14413+
14414+
// ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14415+
// =>
14416+
// Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14417+
// Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14418+
// ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14419+
if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14420+
SDLoc SL(N);
14421+
SDValue Idx = N->getOperand(1);
14422+
unsigned Opc = Vec.getOpcode();
14423+
14424+
switch (Opc) {
14425+
default:
14426+
break;
14427+
// TODO: Support other binary operations.
14428+
case ISD::FADD:
14429+
case ISD::FSUB:
14430+
case ISD::FMUL:
14431+
case ISD::ADD:
14432+
case ISD::UMIN:
14433+
case ISD::UMAX:
14434+
case ISD::SMIN:
14435+
case ISD::SMAX:
14436+
case ISD::FMAXNUM:
14437+
case ISD::FMINNUM:
14438+
case ISD::FMAXNUM_IEEE:
14439+
case ISD::FMINNUM_IEEE:
14440+
case ISD::FMAXIMUM:
14441+
case ISD::FMINIMUM: {
14442+
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14443+
Vec.getOperand(0), Idx);
14444+
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14445+
Vec.getOperand(1), Idx);
14446+
14447+
DCI.AddToWorklist(Elt0.getNode());
14448+
DCI.AddToWorklist(Elt1.getNode());
14449+
return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14450+
}
14451+
}
1443914452
}
14440-
return V;
14441-
}
1444214453

14443-
if (!DCI.isBeforeLegalize())
14444-
return SDValue();
14454+
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14455+
if (shouldExpandVectorDynExt(N)) {
14456+
SDLoc SL(N);
14457+
SDValue Idx = N->getOperand(1);
14458+
SDValue V;
14459+
for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14460+
SDValue IC = DAG.getVectorIdxConstant(I, SL);
14461+
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14462+
if (I == 0)
14463+
V = Elt;
14464+
else
14465+
V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14466+
}
14467+
return V;
14468+
}
1444514469

14446-
// Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14447-
// elements. This exposes more load reduction opportunities by replacing
14448-
// multiple small extract_vector_elements with a single 32-bit extract.
14449-
auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14450-
if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14451-
VecSize > 32 && VecSize % 32 == 0 && Idx) {
14452-
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14453-
14454-
unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14455-
unsigned EltIdx = BitIndex / 32;
14456-
unsigned LeftoverBitIdx = BitIndex % 32;
14457-
SDLoc SL(N);
14470+
if (!DCI.isBeforeLegalize())
14471+
return SDValue();
1445814472

14459-
SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14460-
DCI.AddToWorklist(Cast.getNode());
14473+
// Try to turn sub-dword accesses of vectors into accesses of the same
14474+
// 32-bit elements. This exposes more load reduction opportunities by
14475+
// replacing multiple small extract_vector_elements with a single 32-bit
14476+
// extract.
14477+
auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14478+
if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14479+
VecSize > 32 && VecSize % 32 == 0 && Idx) {
14480+
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14481+
14482+
unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14483+
unsigned EltIdx = BitIndex / 32;
14484+
unsigned LeftoverBitIdx = BitIndex % 32;
14485+
SDLoc SL(N);
1446114486

14462-
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14463-
DAG.getConstant(EltIdx, SL, MVT::i32));
14464-
DCI.AddToWorklist(Elt.getNode());
14465-
SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14466-
DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14467-
DCI.AddToWorklist(Srl.getNode());
14487+
SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14488+
DCI.AddToWorklist(Cast.getNode());
1446814489

14469-
EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14470-
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14471-
DCI.AddToWorklist(Trunc.getNode());
14490+
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14491+
DAG.getConstant(EltIdx, SL, MVT::i32));
14492+
DCI.AddToWorklist(Elt.getNode());
14493+
SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14494+
DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14495+
DCI.AddToWorklist(Srl.getNode());
1447214496

14473-
if (VecEltVT == ResVT) {
14474-
return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14497+
EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14498+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14499+
DCI.AddToWorklist(Trunc.getNode());
14500+
14501+
if (VecEltVT == ResVT) {
14502+
return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14503+
}
14504+
14505+
assert(ResVT.isScalarInteger());
14506+
return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
1447514507
}
1447614508

14477-
assert(ResVT.isScalarInteger());
14478-
return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14509+
return SDValue();
1447914510
}
1448014511

14481-
return SDValue();
14482-
}
14483-
1448414512
SDValue
1448514513
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
1448614514
DAGCombinerInfo &DCI) const {

0 commit comments

Comments
 (0)