Commit ee2f52d
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNEGCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNEGCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing the scalarisation previously performed by the vector legaliser in the combiner.
1 parent 4595d5f commit ee2f52d

16 files changed: +1805 -218 lines
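The rationale for using the 64-bit scalar forms is that bitwise operations have no cross-lane interaction, so the two i32 lanes of a v2i32 can live in one 64-bit register pair and be combined with a single s_or_b64/s_and_b64/s_xor_b64. A minimal standalone C++ check of that equivalence (illustrative only, not code from the patch; the packed layout simply mirrors how a 64-bit register pair would hold the two lanes):

#include <cassert>
#include <cstdint>

int main() {
  // Two v2i32 operands: lane 0 in element 0, lane 1 in element 1.
  uint32_t a[2] = {0x12345678u, 0x9abcdef0u};
  uint32_t b[2] = {0x0ff0f00fu, 0xffff0000u};
  // Pack both lanes into one 64-bit value, lane 1 in the high half.
  uint64_t a64 = (uint64_t(a[1]) << 32) | a[0];
  uint64_t b64 = (uint64_t(b[1]) << 32) | b[0];
  uint64_t o = a64 | b64, x = a64 ^ b64, n = a64 & b64;
  // One wide op reproduces the per-lane results exactly.
  assert(uint32_t(o) == (a[0] | b[0]) && uint32_t(o >> 32) == (a[1] | b[1]));
  assert(uint32_t(x) == (a[0] ^ b[0]) && uint32_t(x >> 32) == (a[1] ^ b[1]));
  assert(uint32_t(n) == (a[0] & b[0]) && uint32_t(n >> 32) == (a[1] & b[1]));
  return 0;
}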

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4033,9 +4033,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4064,6 +4063,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
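For reference, the reduction described in the comment above (and in the analogous srl hunk further down) rests on a simple 64-bit identity: when the shift amount is known to lie in [32, 63], only one 32-bit half of the result is non-zero, and it equals a 32-bit shift by (amount & 31). A small standalone C++ check of that identity (illustrative only, not code from the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t x = 0x12345678deadbeefULL;
  for (uint32_t s = 32; s <= 63; ++s) {
    // shl: low half of the result is 0, high half is lo32(x) << (s & 31),
    // matching the [0, shl i32 X, (Y & 31)] form in the comment.
    uint64_t wide_shl = x << s;
    uint32_t hi = uint32_t(x) << (s & 31);
    assert(wide_shl == (uint64_t(hi) << 32));
    // srl: symmetrically, high half is 0, low half is hi32(x) >> (s & 31).
    uint64_t wide_srl = x >> s;
    uint32_t lo = uint32_t(x >> 32) >> (s & 31);
    assert(wide_srl == uint64_t(lo));
  }
  return 0;
}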
@@ -4105,8 +4151,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4268,6 +4312,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
@@ -4781,8 +4872,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case as
+    // possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4835,8 +4944,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5257,8 +5366,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to performFNEGCombine to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 v2/i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
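The alignbit mention above refers to the usual AMDGPU lowering of a 32-bit rotate: rotr is a funnel shift of the value concatenated with itself, which (as I read the ISA, an assumption not stated in this diff) is what v_alignbit_b32 computes when both of its vector sources are the same register. A standalone C++ sketch of that equivalence (illustrative only):

#include <cassert>
#include <cstdint>

// Rotate-right written as a funnel shift of {x:x}, the shape alignbit takes
// when both sources are the same register.
static uint32_t rotr32(uint32_t x, uint32_t s) {
  uint64_t doubled = (uint64_t(x) << 32) | x; // {hi = x, lo = x}
  return uint32_t(doubled >> (s & 31));
}

int main() {
  const uint32_t x = 0xdeadbeefu;
  for (uint32_t s = 0; s < 32; ++s) {
    uint32_t ref = (s == 0) ? x : ((x >> s) | (x << (32 - s)));
    assert(rotr32(x, s) == ref);
  }
  return 0;
}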

@@ -6075,6 +6083,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, casued by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6266,6 +6288,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13247,6 +13271,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  // v2i32 or(
+  //   v2i32 build_vector(
+  //     i32 extract_elt(%IdentitySrc, 0),
+  //     i32 0
+  //   ),
+  //   v2i32 build_vector(
+  //     i32 0,
+  //     i32 extract_elt(%IdentitySrc, 1)
+  //   ) )
+  // =>
+  // v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
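The combine above only fires for the exact build_vector/extract_vector_elt shape shown in its comment. The underlying identity is straightforward; a standalone C++ sketch with arbitrary lane values (illustrative only, not code from the patch):

#include <cassert>
#include <cstdint>

int main() {
  // %IdentitySrc as a v2i32 value.
  uint32_t src[2] = {0xcafef00du, 0x8badf00du};
  // The two build_vectors from the comment: each keeps one extracted lane
  // and zeroes the other.
  uint32_t lhs[2] = {src[0], 0};
  uint32_t rhs[2] = {0, src[1]};
  // Their OR is just %IdentitySrc again, so the whole expression can be
  // replaced by the source node.
  assert((lhs[0] | rhs[0]) == src[0] && (lhs[1] | rhs[1]) == src[1]);
  return 0;
}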

@@ -13291,13 +13356,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
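This combine leans on the fact that XOR-ing the i32 bit pattern of a float with the sign mask 0x80000000 is exactly fneg, so a v2i32 xor of a vselect result with a sign-mask splat can be rewritten as an FNEG of the bitcast v2f32 select. A standalone C++20 check of the scalar fact (illustrative only; assumes IEEE-754 float):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t SignMask = 0x80000000u;
  // Flipping the sign bit of the bit pattern matches negating the float.
  float vals[] = {0.0f, 1.5f, -3.25f, 1024.0f};
  for (float f : vals) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    assert(std::bit_cast<uint32_t>(-f) == (bits ^ SignMask));
  }
  return 0;
}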

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -443,6 +443,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
