Commit d0186ba

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and in the TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations.
- Fix a rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases (a sketch of this kind of case follows below). Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing, in the combiner, the scalarisation previously performed by the vector legaliser.
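As an illustration only, here is a minimal sketch of the kind of v2i32 case referred to above; the function name and the expected selection are assumptions of this sketch, not text copied from or.ll. With v2i32 OR now legal, a uniform two-element OR should be selected as a single s_or_b64 rather than two separate 32-bit ORs.

; Illustrative reduced case in the spirit of the or.ll additions.
; Kernel arguments are uniform, so the OR below is expected to select
; s_or_b64 with this patch instead of two s_or_b32 instructions.
define amdgpu_kernel void @v2i32_or(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) {
  %r = or <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}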
1 parent c6c2bb7 commit d0186ba

16 files changed: +1805 -218 lines

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4032,9 +4032,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4063,6 +4062,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [63-32].
+          // This transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4104,8 +4150,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4267,6 +4311,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [63-32].
+          // This transforms: DST = srl i64 X, Y to [ srl i32 X, (Y & 31), 0 ].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
@@ -4780,8 +4871,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case as
+    // possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4834,8 +4943,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5256,8 +5365,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to foldFreeOpFromSelect to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -6079,6 +6087,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, caused by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
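As an illustration only, here is a minimal sketch of the kind of rotate case the ROTR handling above is aimed at; the function name is an assumption of this sketch, not text copied from rotr.ll. A vector rotate-right written as a funnel shift is matched to ISD::ROTR and, per the comments above, is then lowered via alignbit rather than through extra vector extraction code.

; Illustrative reduced case: a <2 x i32> rotate-right expressed as a
; funnel shift, the form MatchRotate recognises as ISD::ROTR.
define amdgpu_kernel void @v2i32_rotr(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %amt) {
  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt)
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)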
@@ -6270,6 +6292,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13252,6 +13276,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  //   v2i32 or(
+  //     v2i32 build_vector(
+  //       i32 extract_elt(%IdentitySrc, 0),
+  //       i32 0
+  //     ),
+  //     v2i32 build_vector(
+  //       i32 0,
+  //       i32 extract_elt(%IdentitySrc, 1)
+  //     ) )
+  //   =>
+  //   v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -13296,13 +13361,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
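As an illustration only, here is a reduced sketch of the kind of pattern the v2i32 xor combine above targets (an assumption of this sketch, not text copied from fneg-modifier-casting.ll): a sign-bit xor of a v2i32 vselect whose inputs are bitcast floats, which the combine rewrites as an fneg of the select so the negation can fold into source modifiers.

; Illustrative reduced case: sign-bit xor of a selected pair of bitcast floats.
define <2 x i32> @xor_signmask_of_select(<2 x i1> %c, <2 x float> %a, <2 x float> %b) {
  %ai = bitcast <2 x float> %a to <2 x i32>
  %bi = bitcast <2 x float> %b to <2 x i32>
  %sel = select <2 x i1> %c, <2 x i32> %ai, <2 x i32> %bi
  %neg = xor <2 x i32> %sel, <i32 -2147483648, i32 -2147483648>
  ret <2 x i32> %neg
}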

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -443,6 +443,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
