-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86] Use an FP-based expansion for v4i32 ctlz on SSE2-only targets #167034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |||||||||||||||
| //===----------------------------------------------------------------------===// | ||||||||||||||||
|
|
||||||||||||||||
| #include "llvm/CodeGen/TargetLowering.h" | ||||||||||||||||
| #include "llvm/ADT/APFloat.h" | ||||||||||||||||
| #include "llvm/ADT/STLExtras.h" | ||||||||||||||||
| #include "llvm/Analysis/ValueTracking.h" | ||||||||||||||||
| #include "llvm/Analysis/VectorUtils.h" | ||||||||||||||||
|
|
@@ -9480,6 +9481,55 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const { | |||||||||||||||
| return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const { | ||||||||||||||||
| // Pseudocode (per 32-bit element): | ||||||||||||||||
| // if(x==0) return 32; | ||||||||||||||||
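| // float f = (float) x; // NOTE(review): unsigned->f32 rounds to nearest once x >= 2^24, e.g. 0x7FFFFFFF becomes 2^31 and would yield 0 instead of 1 -- confirm handling | ||||||||||||||||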
| // int i = bitcast<int>(f); | ||||||||||||||||
| // int ilog2 = (i >> 23) - 127; | ||||||||||||||||
| // return 31 - ilog2; | ||||||||||||||||
|
|
||||||||||||||||
| SDLoc dl(Node); | ||||||||||||||||
| SDValue Op = Node->getOperand(0); | ||||||||||||||||
| EVT VT = Op.getValueType(); | ||||||||||||||||
|
|
||||||||||||||||
| assert(VT.isVector() && "This expansion is intended for vectors"); | ||||||||||||||||
|
|
||||||||||||||||
| EVT EltVT = VT.getVectorElementType(); | ||||||||||||||||
| EVT FloatVT, CmpVT; | ||||||||||||||||
| unsigned BitWidth, MantissaBits, ExponentBias; | ||||||||||||||||
|
|
||||||||||||||||
| // Pick the matching float vector type and derive its mantissa width and exponent bias | ||||||||||||||||
| if (EltVT == MVT::i32) { | ||||||||||||||||
| FloatVT = VT.changeVectorElementType(MVT::f32); | ||||||||||||||||
| const fltSemantics &Sem = FloatVT.getVectorElementType().getFltSemantics(); | ||||||||||||||||
| BitWidth = EltVT.getSizeInBits(); | ||||||||||||||||
| MantissaBits = APFloat::semanticsPrecision(Sem) - 1; | ||||||||||||||||
| ExponentBias = | ||||||||||||||||
| static_cast<unsigned>(-APFloat::semanticsMinExponent(Sem) + 1); | ||||||||||||||||
| } | ||||||||||||||||
| else { | ||||||||||||||||
| return SDValue(); | ||||||||||||||||
| } | ||||||||||||||||
|
Comment on lines
+9511
to
+9514
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
|
|
||||||||||||||||
| // Handle Op == 0: the result for zero lanes is BitWidth, held in ZeroRes | ||||||||||||||||
| CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); | ||||||||||||||||
| SDValue Zero = DAG.getConstant(0, dl, VT); | ||||||||||||||||
| SDValue IsZero = DAG.getSetCC(dl, CmpVT, Op, Zero, ISD::SETEQ); | ||||||||||||||||
| SDValue ZeroRes = DAG.getConstant(BitWidth, dl, VT); | ||||||||||||||||
|
Comment on lines
+9516
to
+9520
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can omit the |
||||||||||||||||
|
|
||||||||||||||||
| // Handle non-zero inputs using the algorithm from the pseudocode above | ||||||||||||||||
| SDValue Float = DAG.getNode(ISD::UINT_TO_FP, dl, FloatVT, Op); | ||||||||||||||||
| SDValue FloatBits = DAG.getNode(ISD::BITCAST, dl, VT, Float); | ||||||||||||||||
| SDValue Exp = DAG.getNode(ISD::SRL, dl, VT, FloatBits, DAG.getConstant(MantissaBits, dl, VT)); | ||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should use getShiftAmountTy / getShiftAmountConstant |
||||||||||||||||
| SDValue MSBIndex = DAG.getNode(ISD::SUB, dl, VT, Exp, DAG.getConstant(ExponentBias, dl, VT)); | ||||||||||||||||
| SDValue NonZeroRes = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(BitWidth - 1, dl, VT), MSBIndex); | ||||||||||||||||
|
Comment on lines
+9526
to
+9527
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Suggest emitting |
||||||||||||||||
|
|
||||||||||||||||
| // Select ZeroRes for zero inputs and NonZeroRes for all other inputs | ||||||||||||||||
| return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, | ||||||||||||||||
| const SDLoc &DL, EVT VT, SDValue Op, | ||||||||||||||||
| unsigned BitWidth) const { | ||||||||||||||||
|
|
||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |||||
| setOperationAction(ISD::SUB, MVT::i32, Custom); | ||||||
| } | ||||||
|
|
||||||
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() && | ||||||
| !Subtarget.hasSSSE3()) { | ||||||
| setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); | ||||||
| setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); | ||||||
| } | ||||||
|
|
||||||
| if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { | ||||||
| for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { | ||||||
| setOperationAction(ISD::FFLOOR, RoundedTy, Legal); | ||||||
|
|
@@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, | |||||
| if (VT.is512BitVector() && !Subtarget.hasBWI()) | ||||||
| return splitVectorIntUnary(Op, DAG, DL); | ||||||
|
|
||||||
| if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) { | ||||||
| const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| if (New.getNode()) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| return New; | ||||||
| } | ||||||
|
|
||||||
| assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); | ||||||
| return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. all 3 of these new test files can be dropped - just regenerate the vector-lzcnt-128.ll checks with the update script |
||
|
|
||
| define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 { | ||
| ; CHECK-LABEL: test_v4i32_sse2: | ||
| ; CHECK: # %bb.0: | ||
|
|
||
| ; Zero test (strict CTLZ needs select) | ||
| ; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}} | ||
|
|
||
| ; Exponent extraction + bias arithmetic (order-free) | ||
| ; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}} | ||
| ; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}} | ||
|
|
||
| ; Select/merge (could be por/pandn etc.) | ||
| ; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}} | ||
|
|
||
| ; Must NOT use SSSE3 LUT path | ||
| ; CHECK-NOT: pshufb | ||
|
|
||
| ; CHECK: retq | ||
| %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) | ||
| ret <4 x i32> %res | ||
| } | ||
|
|
||
| declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) | ||
| attributes #0 = { "optnone" } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s | ||
|
|
||
| define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 { | ||
| ; CHECK-LABEL: test_v4i32_sse2_zero_undef: | ||
|
|
||
| ; zero check | ||
| ; CHECK-DAG: pcmpeqd | ||
|
|
||
| ; FP-based mantissa/exponent steps (order may vary) | ||
| ; CHECK-DAG: psrld $16 | ||
| ; CHECK-DAG: subps | ||
| ; CHECK-DAG: psrld $23 | ||
| ; CHECK-DAG: psubd | ||
|
|
||
| ; merge/select | ||
| ; CHECK: pandn | ||
| ; CHECK: por | ||
|
|
||
| ; CHECK-NOT: pshufb | ||
|
|
||
| ; CHECK: retq | ||
|
|
||
| %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true) | ||
| ret <4 x i32> %res | ||
| } | ||
|
|
||
| declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) | ||
| attributes #0 = { "optnone" } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s | ||
|
|
||
| ; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb` | ||
| ; implementation and *not* the floating-point exponent trick. | ||
|
|
||
| define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) { | ||
| ; CHECK-LABEL: test_v4i32_ssse3: | ||
| ; CHECK: # %bb.0: | ||
|
|
||
| ; Must use SSSE3 table LUT: | ||
| ; CHECK: pshufb | ||
|
|
||
| ; Must NOT use FP exponent trick: | ||
| ; CHECK-NOT: cvtdq2ps | ||
| ; CHECK-NOT: psrld $23 | ||
| ; CHECK-NOT: psubd | ||
|
|
||
| ; CHECK: retq | ||
| %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) | ||
| ret <4 x i32> %res | ||
| } | ||
|
|
||
| declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can just write to not have this limitation