Skip to content

Commit 078cc7a

Browse files
committed
[X86] Add SSE2 FP-based v4i32 CTLZ lowering and tests
1 parent 8b1bb2f commit 078cc7a

File tree

5 files changed

+103
-10
lines changed

5 files changed

+103
-10
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "llvm/CodeGen/TargetLowering.h"
14+
#include "llvm/ADT/APFloat.h"
1415
#include "llvm/ADT/STLExtras.h"
1516
#include "llvm/Analysis/ValueTracking.h"
1617
#include "llvm/Analysis/VectorUtils.h"
@@ -9482,6 +9483,13 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
94829483

94839484

94849485
SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const {
9486+
// Pseudocode for one 32-bit element x:
9487+
// if(x==0) return 32;
9488+
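// float f = (float) x;
// NOTE(review): (float)x rounds to nearest, so e.g. x = 0x7FFFFFFF becomes
// 2^31 and ilog2 comes out one too large; confirm the expansion below
// truncates (or clears the low bits) before converting.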
9489+
// int i = bitcast<int>(f);
9490+
// int ilog2 = (i >> 23) - 127;
9491+
// return 31 - ilog2;
9492+
94859493
SDLoc dl(Node);
94869494
SDValue Op = Node->getOperand(0);
94879495
EVT VT = Op.getValueType();
@@ -9495,9 +9503,11 @@ SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const
94959503
// Pick the matching floating-point vector type and set its format parameters.
94969504
if (EltVT == MVT::i32) {
94979505
FloatVT = VT.changeVectorElementType(MVT::f32);
9498-
BitWidth = 32;
9499-
MantissaBits = 23;
9500-
ExponentBias = 127;
9506+
const fltSemantics &Sem = FloatVT.getVectorElementType().getFltSemantics();
9507+
BitWidth = EltVT.getSizeInBits();
9508+
MantissaBits = APFloat::semanticsPrecision(Sem) - 1;
9509+
ExponentBias =
9510+
static_cast<unsigned>(-APFloat::semanticsMinExponent(Sem) + 1);
95019511
}
95029512
else {
95039513
return SDValue();
@@ -9518,13 +9528,6 @@ SDValue TargetLowering::expandCTLZWithFP(SDNode *Node, SelectionDAG &DAG) const
95189528

95199529
// Select ZeroRes for lanes where the input is zero and NonZeroRes otherwise.
95209530
return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes);
9521-
9522-
// pseudocode :
9523-
// if(x==0) return 32;
9524-
// float f = (float) x;
9525-
// int i = bitcast<int>(f);
9526-
// int ilog2 = (i >> 23) - 127;
9527-
// return 31 - ilog2;
95289531
}
95299532

95309533
SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
13481348
setOperationAction(ISD::SUB, MVT::i32, Custom);
13491349
}
13501350

1351+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() &&
1352+
!Subtarget.hasSSSE3()) {
1353+
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1354+
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
1355+
}
1356+
13511357
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
13521358
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
13531359
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
@@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
2903929045
if (VT.is512BitVector() && !Subtarget.hasBWI())
2904029046
return splitVectorIntUnary(Op, DAG, DL);
2904129047

29048+
if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) {
29049+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29050+
SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
29051+
if (New.getNode())
29052+
return New;
29053+
}
29054+
2904229055
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
2904329056
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
2904429057
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
2+
3+
define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 {
4+
; CHECK-LABEL: test_v4i32_sse2:
5+
; CHECK: # %bb.0:
6+
7+
; Zero test (ctlz with is_zero_poison=false needs a select for zero lanes)
8+
; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
9+
10+
; Exponent extraction + bias arithmetic (order-free)
11+
; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}}
12+
; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
13+
14+
; Select/merge (could be por/pandn etc.)
15+
; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
16+
17+
; Must NOT use SSSE3 LUT path
18+
; CHECK-NOT: pshufb
19+
20+
; CHECK: retq
21+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
22+
ret <4 x i32> %res
23+
}
24+
25+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
26+
attributes #0 = { "optnone" }
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
2+
3+
define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 {
4+
; CHECK-LABEL: test_v4i32_sse2_zero_undef:
5+
6+
; zero check
7+
; CHECK-DAG: pcmpeqd
8+
9+
; FP-based mantissa/exponent steps (order may vary)
10+
; CHECK-DAG: psrld $16
11+
; CHECK-DAG: subps
12+
; CHECK-DAG: psrld $23
13+
; CHECK-DAG: psubd
14+
15+
; merge/select
16+
; CHECK: pandn
17+
; CHECK: por
18+
19+
; CHECK-NOT: pshufb
20+
21+
; CHECK: retq
22+
23+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
24+
ret <4 x i32> %res
25+
}
26+
27+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
28+
attributes #0 = { "optnone" }
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s
2+
3+
; This verifies that, with SSSE3 enabled, we use the LUT-based pshufb
4+
; implementation and not the floating-point exponent trick.
5+
6+
define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) {
7+
; CHECK-LABEL: test_v4i32_ssse3:
8+
; CHECK: # %bb.0:
9+
10+
; Must use SSSE3 table LUT:
11+
; CHECK: pshufb
12+
13+
; Must NOT use FP exponent trick:
14+
; CHECK-NOT: cvtdq2ps
15+
; CHECK-NOT: psrld $23
16+
; CHECK-NOT: psubd
17+
18+
; CHECK: retq
19+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
20+
ret <4 x i32> %res
21+
}
22+
23+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

0 commit comments

Comments
 (0)