-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions #140694
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions #140694
Changes from 22 commits
4cb21c3
f0a2cae
20fbed5
3cc5ac1
9b5257e
c4fad1c
257284b
235d6a5
6046c68
3697bcb
d93e8b2
db04694
1c7dafc
57f8903
45e974f
7b31e62
2561e72
58f703e
1faff7f
c52fdbd
7fff273
701c4e9
13a1c96
a29c97f
81d8c16
65a6c60
b61db67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4048,6 +4048,59 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( | |
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); | ||
} | ||
|
||
// Part of the shift combines is to optimise for the case where its possible | ||
// to reduce e.g shl64 to shl32 if shift range is [63-32]. This | ||
// transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The | ||
// '&' is then elided by ISel. The vector code for this was being | ||
// completely scalarised by the vector legalizer, but when v2i32 is | ||
// legal the vector legaliser only partially scalarises the | ||
// vector operations and the and is not elided. This function | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, this might've been answered already in one of the review comments but what's the rationale for not just doing the optimisation here directly on the vector type instead of "preping" the sub-expression for existing optimisation? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was unable to factor out further code as it seems a little bit different for each kind of shift, but I will take another look. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @LU-JOHN please can you let me know if I have I missed anything obvious here? I think you added the reduction to each shift. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm beginning to think that scalarising the AND instruction like this may not be the most intuitive or sensible approach and that it might be worth finding some other means of ensuring it is elided, as intended by the shift optimisation. |
||
// scalarises the AND for this optimisation case. | ||
static SDValue getShiftForReduction(unsigned ShiftOpc, SDValue LHS, SDValue RHS, | ||
chrisjbris marked this conversation as resolved.
Show resolved
Hide resolved
|
||
SelectionDAG &DAG) { | ||
assert( | ||
(ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL || ShiftOpc == ISD::SHL) && | ||
"Expected shift Opcode."); | ||
|
||
chrisjbris marked this conversation as resolved.
Show resolved
Hide resolved
|
||
SDLoc SL = SDLoc(RHS); | ||
if (RHS->getOpcode() != ISD::EXTRACT_VECTOR_ELT) | ||
return SDValue(); | ||
|
||
SDValue VAND = RHS.getOperand(0); | ||
if (VAND->getOpcode() != ISD::AND) | ||
return SDValue(); | ||
|
||
ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1)); | ||
chrisjbris marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (!CRRHS) | ||
return SDValue(); | ||
|
||
SDValue LHSAND = VAND.getOperand(0); | ||
SDValue RHSAND = VAND.getOperand(1); | ||
if (RHSAND->getOpcode() != ISD::BUILD_VECTOR) | ||
return SDValue(); | ||
|
||
ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0)); | ||
ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1)); | ||
if (!CANDL || !CANDR || RHSAND->getConstantOperandVal(0) != 0x1f || | ||
RHSAND->getConstantOperandVal(1) != 0x1f) | ||
return SDValue(); | ||
// Get the non-const AND operands and produce scalar AND | ||
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); | ||
const SDValue One = DAG.getConstant(1, SL, MVT::i32); | ||
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero); | ||
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); | ||
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); | ||
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); | ||
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); | ||
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); | ||
chrisjbris marked this conversation as resolved.
Show resolved
Hide resolved
|
||
uint64_t AndIndex = RHS->getConstantOperandVal(1); | ||
if (AndIndex == 0 || AndIndex == 1) | ||
return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc, | ||
AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags()); | ||
|
||
return SDValue(); | ||
} | ||
|
||
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, | ||
DAGCombinerInfo &DCI) const { | ||
EVT VT = N->getValueType(0); | ||
|
@@ -4057,6 +4110,9 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, | |
SDLoc SL(N); | ||
SelectionDAG &DAG = DCI.DAG; | ||
|
||
if (SDValue SS = getShiftForReduction(ISD::SHL, LHS, RHS, DAG)) | ||
return SS; | ||
|
||
unsigned RHSVal; | ||
if (CRHS) { | ||
RHSVal = CRHS->getZExtValue(); | ||
|
@@ -4098,8 +4154,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, | |
if (VT.getScalarType() != MVT::i64) | ||
return SDValue(); | ||
|
||
// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) | ||
|
||
// On some subtargets, 64-bit shift is a quarter rate instruction. In the | ||
// common case, splitting this into a move and a 32-bit shift is faster and | ||
// the same code size. | ||
|
@@ -4159,6 +4213,9 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, | |
SelectionDAG &DAG = DCI.DAG; | ||
SDLoc SL(N); | ||
|
||
if (SDValue SS = getShiftForReduction(ISD::SRA, LHS, RHS, DAG)) | ||
return SS; | ||
|
||
if (VT.getScalarType() != MVT::i64) | ||
return SDValue(); | ||
|
||
|
@@ -4189,12 +4246,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, | |
(ElementType.getSizeInBits() - 1)) { | ||
ShiftAmt = ShiftFullAmt; | ||
} else { | ||
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); | ||
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); | ||
const SDValue ShiftMask = | ||
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); | ||
// This AND instruction will clamp out of bounds shift values. | ||
// It will also be removed during later instruction selection. | ||
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask); | ||
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); | ||
} | ||
|
||
EVT ConcatType; | ||
|
@@ -4261,6 +4318,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, | |
SDLoc SL(N); | ||
unsigned RHSVal; | ||
|
||
if (SDValue SS = getShiftForReduction(ISD::SRL, LHS, RHS, DAG)) | ||
return SS; | ||
|
||
if (CRHS) { | ||
RHSVal = CRHS->getZExtValue(); | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.