Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -5543,6 +5543,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \returns The expansion result or SDValue() if it fails.
SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const;

/// Expands a CTLZ node into a sequence of floating point operations.
///
/// The operand is converted to floating point with UINT_TO_FP, the result is
/// bitcast back to the integer type, and the leading-zero count is derived
/// from the biased exponent field (for i32: 31 - ((bits >> 23) - 127)).
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails. The expansion
/// fails when the operand's element type is not i32.
SDValue expandCTLZWithFP(SDNode *N, SelectionDAG &DAG) const;

/// Expand CTTZ via Table Lookup.
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails.
Expand Down
50 changes: 50 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
Expand Down Expand Up @@ -9480,6 +9481,55 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL);
}


SDValue TargetLowering::expandCTLZWithFP(SDNode *Node,
                                         SelectionDAG &DAG) const {
  // Expand CTLZ / CTLZ_ZERO_UNDEF by converting the operand to floating point
  // and reading the index of the most significant set bit out of the biased
  // exponent field. Pseudocode for i32 elements:
  //
  //   if (x == 0) return 32;               // omitted for CTLZ_ZERO_UNDEF
  //   x &= ~(x >> 1);                      // keep the MSB, prevent round-up
  //   int bits = bitcast<int>((float)x);
  //   return (31 + 127) - (bits >> 23);
  //
  // Returns SDValue() when the element type is not i32.

  SDLoc dl(Node);
  SDValue Op = Node->getOperand(0);
  EVT VT = Op.getValueType();

  // Only i32 elements are supported; works for both scalars and vectors.
  if (VT.getScalarType() != MVT::i32)
    return SDValue();

  EVT FloatVT =
      VT.isVector() ? VT.changeVectorElementType(MVT::f32) : EVT(MVT::f32);
  const fltSemantics &Sem = FloatVT.getScalarType().getFltSemantics();
  const unsigned BitWidth = VT.getScalarSizeInBits();
  // Number of explicitly stored mantissa bits (23 for f32).
  const unsigned MantissaBits = APFloat::semanticsPrecision(Sem) - 1;
  // Exponent bias (127 for f32).
  const unsigned ExponentBias =
      static_cast<unsigned>(1 - APFloat::semanticsMinExponent(Sem));

  // The int -> float conversion rounds to nearest, so an input whose leading
  // significant bits are all ones (e.g. 0x01FFFFFF or 0xFFFFFFFF) would round
  // up to the next power of two and report an exponent that is one too large.
  // Clearing every bit whose upper neighbour is set keeps the most significant
  // set bit but clears the bit right below it, so the masked value is below
  // 1.5 * 2^k and can never round up to 2^(k+1).
  SDValue Shifted = DAG.getNode(ISD::SRL, dl, VT, Op,
                                DAG.getShiftAmountConstant(1, VT, dl));
  SDValue Masked =
      DAG.getNode(ISD::AND, dl, VT, Op, DAG.getNOT(dl, Shifted, VT));

  // The converted value is never negative, so the sign bit is clear and a
  // logical shift by the mantissa width leaves exactly the biased exponent.
  SDValue Float = DAG.getNode(ISD::UINT_TO_FP, dl, FloatVT, Masked);
  SDValue FloatBits = DAG.getNode(ISD::BITCAST, dl, VT, Float);
  SDValue Exp =
      DAG.getNode(ISD::SRL, dl, VT, FloatBits,
                  DAG.getShiftAmountConstant(MantissaBits, VT, dl));

  // ctlz = (BitWidth - 1) - (Exp - Bias) = (BitWidth - 1 + Bias) - Exp,
  // emitted as a single subtraction.
  SDValue NonZeroRes = DAG.getNode(
      ISD::SUB, dl, VT, DAG.getConstant(BitWidth - 1 + ExponentBias, dl, VT),
      Exp);

  // A zero input is undefined for CTLZ_ZERO_UNDEF, so no fix-up is needed.
  if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF)
    return NonZeroRes;

  // CTLZ must return the bit width for a zero input.
  EVT CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsZero =
      DAG.getSetCC(dl, CmpVT, Op, DAG.getConstant(0, dl, VT), ISD::SETEQ);
  return DAG.getSelect(dl, VT, IsZero, DAG.getConstant(BitWidth, dl, VT),
                       NonZeroRes);
}

SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
const SDLoc &DL, EVT VT, SDValue Op,
unsigned BitWidth) const {
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::i32, Custom);
}

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() &&
!Subtarget.hasSSSE3()) {
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
}

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
Expand Down Expand Up @@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG, DL);

if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
SDValue New = expandCTLZWithFP(Op.getNode(), DAG);

if (New.getNode())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (New.getNode())
if (New)

return New;
}

assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
Expand Down
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all 3 of these new test files can be dropped - just regenerate the vector-lzcnt-128.ll checks with the update script


; Calls llvm.ctlz.v4i32 with the second argument false, i.e. the form that
; must produce a defined result for zero elements. The RUN line for this file
; uses -mattr=+sse2,-ssse3, and the directives below look for the compare,
; the shift by 23, the integer subtract and the final merge.
; NOTE(review): #0 is declared after this function as the quoted string
; attribute "optnone"; presumably the optnone enum attribute was intended --
; confirm.
define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 {
; CHECK-LABEL: test_v4i32_sse2:
; CHECK: # %bb.0:

; Zero test (strict CTLZ needs select)
; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}

; Exponent extraction + bias arithmetic (order-free)
; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}}
; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}

; Select/merge (could be por/pandn etc.)
; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}}

; Must NOT use SSSE3 LUT path
; CHECK-NOT: pshufb

; CHECK: retq
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
ret <4 x i32> %res
}

declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
attributes #0 = { "optnone" }
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s

; Calls llvm.ctlz.v4i32 with the second argument true (zero input is poison).
; The RUN line for this file uses -mattr=+sse2,-ssse3.
; NOTE(review): because a zero input is poison here, the zero compare and the
; pandn/por merge are not required for correctness; the directives below pin
; the output of the current lowering -- confirm whether they should stay.
; NOTE(review): #0 is declared after this function as the quoted string
; attribute "optnone"; presumably the optnone enum attribute was intended --
; confirm.
define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 {
; CHECK-LABEL: test_v4i32_sse2_zero_undef:

; zero check
; CHECK-DAG: pcmpeqd

; FP-based mantissa/exponent steps (order may vary)
; CHECK-DAG: psrld $16
; CHECK-DAG: subps
; CHECK-DAG: psrld $23
; CHECK-DAG: psubd

; merge/select
; CHECK: pandn
; CHECK: por

; CHECK-NOT: pshufb

; CHECK: retq

%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
ret <4 x i32> %res
}

declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
attributes #0 = { "optnone" }
23 changes: 23 additions & 0 deletions llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s

; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb`
; implementation and *not* the floating-point exponent trick.

; Calls llvm.ctlz.v4i32 with the second argument false. The RUN line for this
; file uses -mattr=+ssse3, so the output is expected to contain pshufb and
; none of the instructions associated with the floating-point expansion.
define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) {
; CHECK-LABEL: test_v4i32_ssse3:
; CHECK: # %bb.0:

; Must use SSSE3 table LUT:
; CHECK: pshufb

; Must NOT use FP exponent trick:
; CHECK-NOT: cvtdq2ps
; CHECK-NOT: psrld $23
; CHECK-NOT: psubd

; CHECK: retq
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
ret <4 x i32> %res
}

declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
Loading