From 8b1bb2f67299480fc8553cd9997008b2188d3793 Mon Sep 17 00:00:00 2001 From: NishiB137 Date: Sat, 8 Nov 2025 02:55:30 +0530 Subject: [PATCH 1/2] [CodeGen] Add expandCTLZWithFP helper to TargetLowering supporting vXi32 types --- llvm/include/llvm/CodeGen/TargetLowering.h | 5 ++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 47 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 98565f423df3e..bb58f48cfdb5c 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5543,6 +5543,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const; + /// Expands a CTLZ node into a sequence of floating point operations. + /// \param N Node to expand + /// \returns The expansion result or SDValue() if it fails. + SDValue expandCTLZWithFP(SDNode *N, SelectionDAG &DAG) const; + /// Expand CTTZ via Table Lookup. /// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b51d6649af2ec..d6ab6b2fe77e2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9480,6 +9480,53 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const { return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL); } + +SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const { + SDLoc dl(Node); + SDValue Op = Node->getOperand(0); + EVT VT = Op.getValueType(); + + assert(VT.isVector() && "This expansion is intended for vectors"); + + EVT EltVT = VT.getVectorElementType(); + EVT FloatVT, CmpVT; + unsigned BitWidth, MantissaBits, ExponentBias; + + // Converting to float type + if (EltVT == MVT::i32) { + FloatVT = VT.changeVectorElementType(MVT::f32); + BitWidth = 32; + MantissaBits = 23; + ExponentBias = 127; + } + else { + return SDValue(); + } + + // Handling the case for when Op == 0 which is stored in ZeroRes + CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue IsZero = DAG.getSetCC(dl, CmpVT, Op, Zero, ISD::SETEQ); + SDValue ZeroRes = DAG.getConstant(BitWidth, dl, VT); + + // Non-zero inputs: convert x & ~(x >> 1), which keeps the MSB but clears the bit below it, so the round-to-nearest in UINT_TO_FP can never carry into the exponent (e.g. 0xFFFFFFFF would otherwise become 2^32 and yield -1). + SDValue Float = DAG.getNode(ISD::UINT_TO_FP, dl, FloatVT, DAG.getNode(ISD::AND, dl, VT, Op, DAG.getNOT(dl, DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(1, dl, VT)), VT))); + SDValue FloatBits = DAG.getNode(ISD::BITCAST, dl, VT, Float); + SDValue Exp = DAG.getNode(ISD::SRL, dl, VT, FloatBits, DAG.getConstant(MantissaBits, dl, VT)); + SDValue MSBIndex = DAG.getNode(ISD::SUB, dl, VT, Exp, DAG.getConstant(ExponentBias, dl, VT)); + SDValue NonZeroRes = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(BitWidth - 1, dl, VT), MSBIndex); + + //Returns the respective DAG Node based on the input being zero or non-zero + return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes); + + // pseudocode : + // if(x==0) return 32; + 
// float f = (float) x; + // int i = bitcast(f); + // int ilog2 = (i >> 23) - 127; + // return 31 - ilog2; +} + SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, const SDLoc &DL, EVT VT, SDValue Op, unsigned BitWidth) const { From 078cc7a2e540e911f204f2a8cb7945a170fb2307 Mon Sep 17 00:00:00 2001 From: VindhyaP312 Date: Sat, 8 Nov 2025 03:21:35 +0530 Subject: [PATCH 2/2] [X86] Add SSE2 FP-based v4i32 CTLZ lowering and tests --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 23 ++++++++------- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++++++ llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll | 26 +++++++++++++++++ llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll | 28 +++++++++++++++++++ llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll | 23 +++++++++++++++ 5 files changed, 103 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d6ab6b2fe77e2..b1aa884f56f35 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -9482,6 +9483,13 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const { SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const { + // pseudocode : + // if(x==0) return 32; + // float f = (float) x; + // int i = bitcast(f); + // int ilog2 = (i >> 23) - 127; + // return 31 - ilog2; + SDLoc dl(Node); SDValue Op = Node->getOperand(0); EVT VT = Op.getValueType(); @@ -9495,9 +9503,11 @@ SDValue 
TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const // Converting to float type if (EltVT == MVT::i32) { FloatVT = VT.changeVectorElementType(MVT::f32); - BitWidth = 32; - MantissaBits = 23; - ExponentBias = 127; + const fltSemantics &Sem = FloatVT.getVectorElementType().getFltSemantics(); + BitWidth = EltVT.getSizeInBits(); + MantissaBits = APFloat::semanticsPrecision(Sem) - 1; + ExponentBias = + static_cast<unsigned>(-APFloat::semanticsMinExponent(Sem) + 1); } else { return SDValue(); @@ -9518,13 +9528,6 @@ SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const //Returns the respective DAG Node based on the input being zero or non-zero return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes); - - // pseudocode : - // if(x==0) return 32; - // float f = (float) x; - // int i = bitcast(f); - // int ilog2 = (i >> 23) - 127; - // return 31 - ilog2; } SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 05a854a0bf3fa..bdea6c4734908 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, MVT::i32, Custom); } + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() && + !Subtarget.hasSSSE3()) { + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); @@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, if (VT.is512BitVector() && !Subtarget.hasBWI()) return splitVectorIntUnary(Op, DAG, DL); + if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) { + const 
TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG); + if (New.getNode()) + return New; + } + assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll new file mode 100644 index 0000000000000..20467b3799875 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s + +define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 { +; CHECK-LABEL: test_v4i32_sse2: +; CHECK: # %bb.0: + +; Zero test (strict CTLZ needs select) +; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}} + +; Exponent extraction + bias arithmetic (order-free) +; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}} +; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}} + +; Select/merge (could be por/pandn etc.) +; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}} + +; Must NOT use SSSE3 LUT path +; CHECK-NOT: pshufb + +; CHECK: retq + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) +attributes #0 = { "optnone" } diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll new file mode 100644 index 0000000000000..6949fe4110e58 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s + +define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 { +; CHECK-LABEL: test_v4i32_sse2_zero_undef: + +; zero check +; CHECK-DAG: pcmpeqd + +; FP-based mantissa/exponent steps (order may vary) +; CHECK-DAG: psrld $16 +; CHECK-DAG: subps +; CHECK-DAG: psrld $23 +; CHECK-DAG: psubd + +; merge/select +; CHECK: pandn +; CHECK: por + +; CHECK-NOT: pshufb + +; CHECK: retq + + %res = 
call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) +attributes #0 = { "optnone" } diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll new file mode 100644 index 0000000000000..8d10c17223a21 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s + +; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb` +; implementation and *not* the floating-point exponent trick. + +define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) { +; CHECK-LABEL: test_v4i32_ssse3: +; CHECK: # %bb.0: + +; Must use SSSE3 table LUT: +; CHECK: pshufb + +; Must NOT use FP exponent trick: +; CHECK-NOT: cvtdq2ps +; CHECK-NOT: psrld $23 +; CHECK-NOT: psubd + +; CHECK: retq + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)