-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[X86] AVX512 optimised CTLZ/CTTZ implementations for i256/i512 scalars #164671
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
6ee5b3e
bf0d058
01aff8f
021da37
ddc7da3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2654,6 +2654,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |
| ISD::AVGCEILU, | ||
| ISD::AVGFLOORS, | ||
| ISD::AVGFLOORU, | ||
| ISD::CTLZ, | ||
| ISD::CTTZ, | ||
| ISD::CTLZ_ZERO_UNDEF, | ||
| ISD::CTTZ_ZERO_UNDEF, | ||
| ISD::BITREVERSE, | ||
| ISD::ADD, | ||
| ISD::FADD, | ||
|
|
@@ -55162,6 +55166,61 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, | |
| return combineFneg(N, DAG, DCI, Subtarget); | ||
| } | ||
|
|
||
| // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 | ||
| // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. | ||
| // Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress | ||
| // the result to remove all zero elements (passthru is set to scalar bitwidth if | ||
| // all elements are zero) and extract the lowest compressed element. | ||
| static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG, | ||
| TargetLowering::DAGCombinerInfo &DCI, | ||
| const X86Subtarget &Subtarget) { | ||
| EVT VT = N->getValueType(0); | ||
| SDValue N0 = N->getOperand(0); | ||
| unsigned Opc = N->getOpcode(); | ||
| unsigned SizeInBits = VT.getSizeInBits(); | ||
| assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ || | ||
| Opc == ISD::CTTZ_ZERO_UNDEF) && | ||
| "Unsupported bit count"); | ||
|
|
||
| if (VT.isScalarInteger() && Subtarget.hasCDI() && | ||
| ((SizeInBits == 512 && Subtarget.useAVX512Regs()) || | ||
| (SizeInBits == 256 && Subtarget.hasVLX() && | ||
| X86::mayFoldLoad(N0, Subtarget)))) { | ||
| MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); | ||
| MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); | ||
| SDValue Vec = DAG.getBitcast(VecVT, N0); | ||
| SDLoc DL(N); | ||
|
|
||
| SmallVector<int, 8> RevMask; | ||
| SmallVector<SDValue, 8> Offsets; | ||
| for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { | ||
| RevMask.push_back((int)((E - 1) - I)); | ||
| Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64)); | ||
| } | ||
|
|
||
| // CTLZ - reverse the elements as we want the top non-zero element. | ||
| if (Opc == ISD::CTLZ) | ||
| Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask); | ||
|
|
||
| SDValue PassThrough = DAG.getUNDEF(VecVT); | ||
| if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) | ||
| PassThrough = DAG.getConstant(SizeInBits, DL, VecVT); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should it be 64 instead of `SizeInBits`?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No - I've added the offsets to each element at this point. The pass-through is for the compress and will only appear in elt[0] if the entire vector is zero, in which case it should return the full scalar integer width (256/512). |
||
|
|
||
| SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec, | ||
| DAG.getConstant(0, DL, VecVT), ISD::SETNE); | ||
| SDValue Cnt = DAG.getNode(Opc, DL, VecVT, Vec); | ||
| Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt, | ||
| DAG.getBuildVector(VecVT, DL, Offsets)); | ||
| Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are we missing Cnt for the `*_ZERO_UNDEF` opcodes?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice catch - yes, we mustn't use the ZERO_UNDEF variants on the vector op. |
||
| PassThrough); | ||
| Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt, | ||
| DAG.getVectorIdxConstant(0, DL)); | ||
| return DAG.getZExtOrTrunc(Cnt, DL, VT); | ||
| } | ||
|
|
||
| return SDValue(); | ||
| } | ||
|
|
||
| static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, | ||
| TargetLowering::DAGCombinerInfo &DCI, | ||
| const X86Subtarget &Subtarget) { | ||
|
|
@@ -60885,6 +60944,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, | |
| case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); | ||
| case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); | ||
| case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); | ||
| case ISD::CTLZ: | ||
| case ISD::CTTZ: | ||
| case ISD::CTLZ_ZERO_UNDEF: | ||
| case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget); | ||
| case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); | ||
| case ISD::AVGCEILS: | ||
| case ISD::AVGCEILU: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand it. Isn't the MSB in the top element already?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For CTLZ we need to isolate the first non-zero element counting from the end of the vector, but the compress trick extracts the lowest remaining element, so the vector has to be reversed to bring that element to the front.