
Commit e86a2a1

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases (a sketch of such a case follows below). Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing the scalarisation previously performed by the vector legaliser in the combiner.
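For reference, a minimal LLVM IR sketch (a hypothetical example, not one of the committed tests) of the kind of v2i32 case exercised in or.ll: with v2i32 OR legal, the whole vector operation can be selected to a single s_or_b64 on the scalar unit when the operands are uniform.

; Hypothetical illustration only.
define amdgpu_kernel void @v2i32_or(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) {
  %r = or <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}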
1 parent a41c1e1 commit e86a2a1

16 files changed: +1805 -218 lines

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4031,9 +4031,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4062,6 +4061,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
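To make the reduction described in the comment above concrete, here is a hypothetical IR-level example (not taken from the test files): when the shift amount is provably in [32, 63], the 64-bit shl can be reduced to a 32-bit shl of the low half with the amount masked by 31, and ISel can then elide that mask.

; Hypothetical illustration only.
define i64 @shl64_amount_in_range(i64 %x, i64 %y) {
  %lo5 = and i64 %y, 31        ; keep only the low five bits of the amount
  %amt = or i64 %lo5, 32       ; amount is now provably in [32, 63]
  %r = shl i64 %x, %amt        ; -> hi = shl i32 (trunc %x), (%amt & 31); lo = 0
  ret i64 %r
}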
@@ -4103,8 +4149,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4266,6 +4310,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
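The srl path mirrors the shl case. A hypothetical vector example, in the spirit of the shl/srl64_reduce tests but not copied from them: when a <2 x i64> shift is split, the per-element amounts become a v2i32 value, and the (amount & 31) shows up as the v2i32 AND with a <31, 31> build_vector that the code above scalarises so ISel can elide it.

; Hypothetical illustration only.
define <2 x i64> @srl64_amount_in_range(<2 x i64> %x, <2 x i64> %y) {
  %lo5 = and <2 x i64> %y, <i64 31, i64 31>
  %amt = or <2 x i64> %lo5, <i64 32, i64 32>   ; amounts provably in [32, 63]
  %r = lshr <2 x i64> %x, %amt
  ret <2 x i64> %r
}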

@@ -4779,8 +4870,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case as
+    // possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
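A hypothetical source pattern for the guard above, modelled on fneg-modifier-casting.ll but not copied from it: both select operands are fnegs of f32 values bitcast from i32. Per the comment, this can be lowered directly to a v_cndmask with negated source modifiers, so the fneg should stay inside the select rather than being pulled out.

; Hypothetical illustration only.
define float @select_fneg_of_casts(i1 %c, i32 %x, i32 %y) {
  %fx = bitcast i32 %x to float
  %fy = bitcast i32 %y to float
  %nx = fneg float %fx
  %ny = fneg float %fy
  %r = select i1 %c, float %nx, float %ny
  ret float %r
}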
@@ -4833,8 +4942,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5172,8 +5281,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to performFNEGCombine to
+    // prevent this combine from being undone under certain conditions.
    // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 v2/i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
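One plausible source-level shape, again in the spirit of fneg-modifier-casting.ll rather than copied from it, that can leave an fneg sitting on a value selected between two integer bit patterns in the DAG; with the change above the fneg is pushed into both select operands instead of the combine bailing out.

; Hypothetical illustration only.
define float @fneg_of_selected_bits(i1 %c, i32 %a, i32 %b) {
  %s = select i1 %c, i32 %a, i32 %b
  %f = bitcast i32 %s to float
  %n = fneg float %f
  ret float %n
}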

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
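With SELECT v2i32 marked Custom above, a whole-vector select such as the following hypothetical case is lowered through SITargetLowering::LowerSELECT into cndmask operations rather than being rewritten in terms of the now-legal v2i32 bitwise ops.

; Hypothetical illustration only.
define <2 x i32> @v2i32_select(i1 %c, <2 x i32> %a, <2 x i32> %b) {
  %r = select i1 %c, <2 x i32> %a, <2 x i32> %b
  ret <2 x i32> %r
}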

@@ -6045,6 +6053,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, casued by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
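A hypothetical v2i32 rotate written with the funnel-shift intrinsic (both value operands the same): per the comments above, MatchRotate turns this into ISD::ROTR, and lowerROTR unrolls the vector so each element can still be matched to alignbit.

; Hypothetical illustration only.
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)

define <2 x i32> @v2i32_rotr(<2 x i32> %x, <2 x i32> %amt) {
  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt)
  ret <2 x i32> %r
}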
@@ -6236,6 +6258,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13217,6 +13241,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  // v2i32 or(
+  //   v2i32 build_vector(
+  //     i32 extract_elt(%IdentitySrc, 0),
+  //     i32 0
+  //   ),
+  //   v2i32 build_vector(
+  //     i32 0,
+  //     i32 extract_elt(%IdentitySrc, 1)
+  //   ) )
+  // =>
+  // v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
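For illustration, hypothetical IR that mirrors the DAG shape shown in the comment above (in practice this shape usually appears after legalisation rather than directly from source): each build_vector holds one element extracted from %src and a zero in the other lane, so the OR of the two is just %src.

; Hypothetical illustration only.
define <2 x i32> @or_identity(<2 x i32> %src) {
  %e0 = extractelement <2 x i32> %src, i32 0
  %e1 = extractelement <2 x i32> %src, i32 1
  %v0 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %e0, i32 0
  %v1 = insertelement <2 x i32> <i32 0, i32 poison>, i32 %e1, i32 1
  %r = or <2 x i32> %v0, %v1
  ret <2 x i32> %r
}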

@@ -13261,13 +13326,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
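A hypothetical IR pattern matching the shape handled above: a v2i32 xor of a vector select with the sign-bit mask in both lanes. The combine rewrites it as a v2f32 select of fnegs so the negation can later be folded into source modifiers.

; Hypothetical illustration only.
define <2 x i32> @xor_signmask_of_select(<2 x i1> %c, <2 x i32> %a, <2 x i32> %b) {
  %s = select <2 x i1> %c, <2 x i32> %a, <2 x i32> %b
  %x = xor <2 x i32> %s, <i32 -2147483648, i32 -2147483648>
  ret <2 x i32> %x
}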

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -443,6 +443,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
