llvm · NishiB137 · Nov 7, 2025 · Nov 7, 2025 · arsenm · Nov 8, 2025
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5543,6 +5543,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// \returns The expansion result or SDValue() if it fails.
   SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expands a CTLZ node into a sequence of floating point operations.
+  /// \param N Node to expand
+  /// \returns The expansion result or SDValue() if it fails.
+  SDValue expandCTLZWithFP(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand CTTZ via Table Lookup.
   /// \param N Node to expand
   /// \returns The expansion result or SDValue() if it fails.

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -9480,6 +9481,55 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL);
 }
 
+
+SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const {
+  // pseudocode :
+  // if(x==0) return 32;
+  // float f = (float) x;
+  // int i = bitcast<int>(f);
+  // int ilog2 = (i >> 23) - 127;
+  // return 31 - ilog2;
+
+  SDLoc dl(Node);
+  SDValue Op = Node->getOperand(0);
+  EVT VT = Op.getValueType();
+
+  assert(VT.isVector() && "This expansion is intended for vectors");
+
+  EVT EltVT = VT.getVectorElementType();
+  EVT FloatVT, CmpVT;
+  unsigned BitWidth, MantissaBits, ExponentBias;
+
+  // Converting to float type
+  if (EltVT == MVT::i32) {
+    FloatVT = VT.changeVectorElementType(MVT::f32);
+    const fltSemantics &Sem = FloatVT.getVectorElementType().getFltSemantics();
+    BitWidth = EltVT.getSizeInBits();
+    MantissaBits = APFloat::semanticsPrecision(Sem) - 1;
+    ExponentBias =
+        static_cast<unsigned>(-APFloat::semanticsMinExponent(Sem) + 1);
+  } 
+  else {
+    return SDValue();
+  }
-  } 
-  else {
-    return SDValue();
-  }
+  } else {
+    return SDValue();
+  }
-  } 
-  else {
-    return SDValue();
-  }
+  } else {
+    return SDValue();
+  }
+
+  // Handling the case for when Op == 0 which is stored in ZeroRes
+  CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+  SDValue IsZero = DAG.getSetCC(dl, CmpVT, Op, Zero, ISD::SETEQ);
+  SDValue ZeroRes = DAG.getConstant(BitWidth, dl, VT);
+
+  // Handling the case for Non-zero inputs using the algorithm mentioned below
+  SDValue Float = DAG.getNode(ISD::UINT_TO_FP, dl, FloatVT, Op);
+  SDValue FloatBits = DAG.getNode(ISD::BITCAST, dl, VT, Float);
+  SDValue Exp = DAG.getNode(ISD::SRL, dl, VT, FloatBits, DAG.getConstant(MantissaBits, dl, VT));
+  SDValue MSBIndex = DAG.getNode(ISD::SUB, dl, VT, Exp, DAG.getConstant(ExponentBias, dl, VT));
+  SDValue NonZeroRes = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(BitWidth - 1, dl, VT), MSBIndex);
+
+  //Returns the respective DAG Node based on the input being zero or non-zero
+  return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes);
+}
+
 SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
                                         const SDLoc &DL, EVT VT, SDValue Op,
                                         unsigned BitWidth) const {

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SUB,                MVT::i32, Custom);
   }
 
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() &&
+      !Subtarget.hasSSSE3()) {
+    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
+  }
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
@@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
   if (VT.is512BitVector() && !Subtarget.hasBWI())
     return splitVectorIntUnary(Op, DAG, DL);
 
+  if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
-    SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
+    SDValue New = expandCTLZWithFP(Op.getNode(), DAG);
-    SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
+    SDValue New = expandCTLZWithFP(Op.getNode(), DAG);
+    if (New.getNode())
-    if (New.getNode())
+    if (New)
-    if (New.getNode())
+    if (New)
+      return New;
+  }
+
   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
 }

diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
+
+define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 {
+; CHECK-LABEL: test_v4i32_sse2:
+; CHECK:       # %bb.0:
+
+; Zero test (strict CTLZ needs select)
+; CHECK-DAG:   pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Exponent extraction + bias arithmetic (order-free)
+; CHECK-DAG:   psrld {{\$}}23, %xmm{{[0-9]+}}
+; CHECK-DAG:   psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Select/merge (could be por/pandn etc.)
+; CHECK:       por %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Must NOT use SSSE3 LUT path
+; CHECK-NOT:   pshufb
+
+; CHECK:       retq
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
+attributes #0 = { "optnone" }
diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
+
+define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 {
+; CHECK-LABEL: test_v4i32_sse2_zero_undef:
+
+; zero check
+; CHECK-DAG:   pcmpeqd
+
+; FP-based mantissa/exponent steps (order may vary)
+; CHECK-DAG:   psrld     $16
+; CHECK-DAG:   subps
+; CHECK-DAG:   psrld     $23
+; CHECK-DAG:   psubd
+
+; merge/select
+; CHECK:       pandn
+; CHECK:       por
+
+; CHECK-NOT:   pshufb
+
+; CHECK: retq
+
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
+attributes #0 = { "optnone" }
diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s
+
+; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb`
+; implementation and *not* the floating-point exponent trick.
+
+define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) {
+; CHECK-LABEL: test_v4i32_ssse3:
+; CHECK:       # %bb.0:
+
+; Must use SSSE3 table LUT:
+; CHECK:       pshufb
+
+; Must NOT use FP exponent trick:
+; CHECK-NOT:   cvtdq2ps
+; CHECK-NOT:   psrld $23
+; CHECK-NOT:   psubd
+
+; CHECK:       retq
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)