-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Convert more 64-bit lshr to 32-bit if shift amt>=32 #138204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
9435c88
0cd5c0d
a0ec5b4
5099306
90dde92
77c6b4a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4097,7 +4097,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, | |
| if (VT.getScalarType() != MVT::i64) | ||
| return SDValue(); | ||
|
|
||
| // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) | ||
| // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) | ||
|
|
||
| // On some subtargets, 64-bit shift is a quarter rate instruction. In the | ||
| // common case, splitting this into a move and a 32-bit shift is faster and | ||
|
|
@@ -4117,12 +4117,12 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, | |
| ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL, | ||
| TargetType); | ||
| } else { | ||
| SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); | ||
| SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); | ||
| const SDValue ShiftMask = | ||
| DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); | ||
| // This AND instruction will clamp out of bounds shift values. | ||
| // It will also be removed during later instruction selection. | ||
| ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask); | ||
| ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); | ||
| } | ||
|
|
||
| SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS); | ||
|
|
@@ -4181,50 +4181,105 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, | |
|
|
||
| SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, | ||
| DAGCombinerInfo &DCI) const { | ||
| auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); | ||
| if (!RHS) | ||
| return SDValue(); | ||
|
|
||
| SDValue RHS = N->getOperand(1); | ||
| ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); | ||
| EVT VT = N->getValueType(0); | ||
| SDValue LHS = N->getOperand(0); | ||
| unsigned ShiftAmt = RHS->getZExtValue(); | ||
| SelectionDAG &DAG = DCI.DAG; | ||
| SDLoc SL(N); | ||
| unsigned RHSVal; | ||
|
|
||
| // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) | ||
| // this improves the ability to match BFE patterns in isel. | ||
| if (LHS.getOpcode() == ISD::AND) { | ||
| if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { | ||
| unsigned MaskIdx, MaskLen; | ||
| if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && | ||
| MaskIdx == ShiftAmt) { | ||
| return DAG.getNode( | ||
| ISD::AND, SL, VT, | ||
| DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), | ||
| DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); | ||
| if (CRHS) { | ||
| RHSVal = CRHS->getZExtValue(); | ||
|
|
||
| // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) | ||
| // this improves the ability to match BFE patterns in isel. | ||
| if (LHS.getOpcode() == ISD::AND) { | ||
| if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { | ||
| unsigned MaskIdx, MaskLen; | ||
| if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && | ||
| MaskIdx == RHSVal) { | ||
| return DAG.getNode(ISD::AND, SL, VT, | ||
| DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), | ||
| N->getOperand(1)), | ||
| DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), | ||
| N->getOperand(1))); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (VT != MVT::i64) | ||
| if (VT.getScalarType() != MVT::i64) | ||
| return SDValue(); | ||
|
|
||
| if (ShiftAmt < 32) | ||
| // for C >= 32 | ||
| // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0) | ||
LU-JOHN marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // On some subtargets, 64-bit shift is a quarter rate instruction. In the | ||
| // common case, splitting this into a move and a 32-bit shift is faster and | ||
| // the same code size. | ||
| KnownBits Known = DAG.computeKnownBits(RHS); | ||
|
|
||
| EVT ElementType = VT.getScalarType(); | ||
| EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); | ||
| EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) | ||
| : TargetScalarType; | ||
|
|
||
| if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) | ||
| return SDValue(); | ||
|
|
||
| // srl i64:x, C for C >= 32 | ||
| // => | ||
| // build_pair (srl hi_32(x), C - 32), 0 | ||
| SDValue Zero = DAG.getConstant(0, SL, MVT::i32); | ||
| SDValue ShiftAmt; | ||
| if (CRHS) { | ||
| ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL, | ||
| TargetType); | ||
| } else { | ||
| SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); | ||
| const SDValue ShiftMask = | ||
| DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); | ||
| // This AND instruction will clamp out of bounds shift values. | ||
| // It will also be removed during later instruction selection. | ||
| ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); | ||
| } | ||
|
|
||
| const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType); | ||
| EVT ConcatType; | ||
| SDValue Hi; | ||
| SDLoc LHSSL(LHS); | ||
| // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi | ||
| if (VT.isVector()) { | ||
| unsigned NElts = TargetType.getVectorNumElements(); | ||
| ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext()); | ||
| SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); | ||
| SmallVector<SDValue, 8> HiOps(NElts); | ||
| SmallVector<SDValue, 16> HiAndLoOps; | ||
|
|
||
| SDValue Hi = getHiHalf64(LHS, DAG); | ||
| DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2); | ||
| for (unsigned I = 0; I != NElts; ++I) | ||
| HiOps[I] = HiAndLoOps[2 * I + 1]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you can simply use insert.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel array indexing is clearer than using an insert. For reference, look at the last commit in #132964, which switched from insert to array indexing to address feedback. |
||
| Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps); | ||
| } else { | ||
| const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType); | ||
| ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2); | ||
| SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS); | ||
| Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One); | ||
| } | ||
|
|
||
| SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); | ||
| SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); | ||
| SDValue NewShift = DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt); | ||
|
|
||
| SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); | ||
| SDValue Vec; | ||
| if (VT.isVector()) { | ||
| unsigned NElts = TargetType.getVectorNumElements(); | ||
| SmallVector<SDValue, 8> LoOps; | ||
| SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero); | ||
|
|
||
| return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); | ||
| DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts); | ||
| for (unsigned I = 0; I != NElts; ++I) | ||
| HiAndLoOps[2 * I] = LoOps[I]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similarly, insert. |
||
| Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps); | ||
| } else { | ||
| Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero}); | ||
| } | ||
| return DAG.getNode(ISD::BITCAST, SL, VT, Vec); | ||
| } | ||
|
|
||
| SDValue AMDGPUTargetLowering::performTruncateCombine( | ||
|
|
@@ -5209,21 +5264,19 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, | |
|
|
||
| break; | ||
| } | ||
| case ISD::SHL: { | ||
| case ISD::SHL: | ||
| case ISD::SRL: { | ||
| // Range metadata can be invalidated when loads are converted to legal types | ||
| // (e.g. v2i64 -> v4i32). | ||
| // Try to convert vector shl before type legalization so that range metadata | ||
| // can be utilized. | ||
| // Try to convert vector shl/srl before type legalization so that range | ||
| // metadata can be utilized. | ||
| if (!(N->getValueType(0).isVector() && | ||
| DCI.getDAGCombineLevel() == BeforeLegalizeTypes) && | ||
| DCI.getDAGCombineLevel() < AfterLegalizeDAG) | ||
| break; | ||
| return performShlCombine(N, DCI); | ||
| } | ||
| case ISD::SRL: { | ||
| if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) | ||
| break; | ||
|
|
||
| if (N->getOpcode() == ISD::SHL) { | ||
| return performShlCombine(N, DCI); | ||
| } | ||
LU-JOHN marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return performSrlCombine(N, DCI); | ||
| } | ||
| case ISD::SRA: { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do this in DAG combine? This seems to be target dependent.