
Commit 4cb21c3

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity or operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing in the combiner the scalarisation previously performed by the vector legaliser.
1 parent 9f102a9 commit 4cb21c3

16 files changed: +1807 −219 lines changed
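For context, a minimal sketch (assumed, not copied from the commit) of the kind of uniform v2i32 bitwise operation the new or.ll/xor.ll cases exercise; with this patch it is expected to select to a single s_or_b64 rather than a pair of 32-bit ops. The function name and the use of inreg to keep the operands in SGPRs are illustrative assumptions:

; Illustrative sketch only; the real cases live in or.ll/xor.ll.
define <2 x i32> @or_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b) {
  %r = or <2 x i32> %a, %b      ; expected to map to s_or_b64 on the scalar path
  ret <2 x i32> %r
}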

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4026,9 +4026,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4057,6 +4056,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4098,8 +4144,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4261,6 +4305,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where its possible
+          // to reduce shl64 to shl32 if shift range is [63-32]. This
+          // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
+          // '&' is then elided by ISel. The vector code for this was being
+          // completely scalarised by the vector legalizer, but now v2i32 is
+          // made legal the vector legaliser only partially scalarises the
+          // vector operations and the and was not elided. This check enables us
+          // to locate and scalarise the v2i32 and and re-enable ISel to elide
+          // the and instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar AND
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
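For reference, a minimal sketch (an assumption, not taken from the commit's tests) of the kind of vector shift the shl64/srl64 reduce combines target: when every lane's shift amount is provably at least 32, each 64-bit shift collapses to a 32-bit shift, and the (Y & 31) mask produced above should again be elided by ISel.

; Illustrative only: the or forces every lane's shift amount into [32, 63]
; (amounts of 64 or more are poison for shl anyway).
define <2 x i64> @shl_v2i64_reduce_sketch(<2 x i64> %x, <2 x i64> %y) {
  %amt = or <2 x i64> %y, <i64 32, i64 32>
  %shl = shl <2 x i64> %x, %amt
  ret <2 x i64> %shl
}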
@@ -4774,8 +4865,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case as
+    // possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4828,8 +4937,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5167,8 +5276,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to performFNEGCombine to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 v2/i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
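For context, a minimal sketch (assumed, in the spirit of fneg-modifier-casting.ll rather than copied from it) of the pattern that the SELECT case above and the foldFreeOpFromSelect change protect; keeping the fneg on the select operands lets it fold into the source modifiers of a single v_cndmask:

; Illustrative only.
define float @fneg_select_of_bitcasts(i1 %c, i32 %a, i32 %b) {
  %fa = bitcast i32 %a to float
  %fb = bitcast i32 %b to float
  %sel = select i1 %c, float %fa, float %fb
  %neg = fneg float %sel
  ret float %neg
}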

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -6183,6 +6191,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, casued by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6374,6 +6396,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13412,6 +13436,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  //   v2i32 or(
+  //     v2i32 build_vector(
+  //       i32 extract_elt(%IdentitySrc, 0),
+  //       i32 0
+  //     ),
+  //     v2i32 build_vector(
+  //       i32 0,
+  //       i32 extract_elt(%IdentitySrc, 1)
+  //     ) )
+  //   =>
+  //   v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
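One plausible way (an assumption, not a case from the commit's tests) such an identity OR can appear once v2i32 operations stay in vector form is a vector being reassembled from its own two lanes; the fold above collapses the whole expression back to the source vector:

; Illustrative only.
define <2 x i32> @identity_or_v2i32(<2 x i32> %v) {
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %lo = insertelement <2 x i32> <i32 0, i32 0>, i32 %e0, i32 0
  %hi = insertelement <2 x i32> <i32 0, i32 0>, i32 %e1, i32 1
  %r  = or <2 x i32> %lo, %hi
  ret <2 x i32> %r
}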
@@ -13456,13 +13521,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -444,6 +444,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
