Skip to content

Commit e712022

Browse files
committed
[X86] Add SSE2 FP-based v4i32 CTLZ lowering and tests
1 parent 8b1bb2f commit e712022

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
13481348
setOperationAction(ISD::SUB, MVT::i32, Custom);
13491349
}
13501350

1351+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() &&
1352+
!Subtarget.hasSSSE3()) {
1353+
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1354+
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
1355+
}
1356+
13511357
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
13521358
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
13531359
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
@@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
2903929045
if (VT.is512BitVector() && !Subtarget.hasBWI())
2904029046
return splitVectorIntUnary(Op, DAG, DL);
2904129047

29048+
if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) {
29049+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29050+
SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
29051+
if (New.getNode())
29052+
return New;
29053+
}
29054+
2904229055
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
2904329056
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
2904429057
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
2+
3+
; Codegen test for @llvm.ctlz.v4i32 called with its second operand set to
; false. Per the LLVM LangRef that form must produce a defined result for a
; zero input, so the expected assembly contains a compare plus a merge.
; The function is tagged with attribute group #0.
define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 {
4+
; CHECK-LABEL: test_v4i32_sse2:
5+
; CHECK: # %bb.0:
6+
7+
; Zero-input handling: the zero-is-defined form of ctlz needs a compare whose
; result is later merged in (see the por below).
8+
; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
9+
10+
; Exponent extraction + bias arithmetic. These two patterns are matched in
; either order. NOTE(review): the shift amount 23 presumably corresponds to
; the f32 mantissa width, moving the biased exponent to the low bits - confirm.
11+
; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}}
12+
; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
13+
14+
; Select/merge step. Only the final por is pinned here; it must appear after
; the order-free group above.
15+
; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
16+
17+
; Must NOT use the SSSE3 LUT path. Note that this negative check only guards
; the text between the por match above and the retq match below; a pshufb
; emitted earlier in the function would not be caught.
18+
; CHECK-NOT: pshufb
19+
20+
; CHECK: retq
21+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
22+
ret <4 x i32> %res
23+
}
24+
25+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
26+
; NOTE(review): "optnone" in quotes is a string attribute, which is distinct
; from the unquoted optnone enum function attribute (the enum form must be
; paired with noinline). Confirm which one was intended for this test.
attributes #0 = { "optnone" }
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
2+
3+
; Codegen test for @llvm.ctlz.v4i32 called with its second operand set to
; true, i.e. the form whose result for a zero input is poison per the LLVM
; LangRef. The patterns below match mnemonics only (no operands).
; The function is tagged with attribute group #0.
define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 {
4+
; CHECK-LABEL: test_v4i32_sse2_zero_undef:
5+
6+
; Compare step, labelled a zero check by the original author.
; NOTE(review): with the zero-is-poison form a zero-input select should not be
; required; confirm what this pcmpeqd is actually used for in the output.
7+
; CHECK-DAG: pcmpeqd
8+
9+
; FP-based mantissa/exponent steps. All four patterns form one order-free
; group, so they may appear in any order relative to each other and to the
; pcmpeqd above.
10+
; CHECK-DAG: psrld $16
11+
; CHECK-DAG: subps
12+
; CHECK-DAG: psrld $23
13+
; CHECK-DAG: psubd
14+
15+
; Merge/select: pandn must be followed (not necessarily immediately) by por.
16+
; CHECK: pandn
17+
; CHECK: por
18+
19+
; Excludes the SSSE3 LUT instruction, but only in the region between the por
; match above and the retq match below.
; CHECK-NOT: pshufb
20+
21+
; CHECK: retq
22+
23+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
24+
ret <4 x i32> %res
25+
}
26+
27+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
28+
; NOTE(review): "optnone" in quotes is a string attribute, which is distinct
; from the unquoted optnone enum function attribute (the enum form must be
; paired with noinline). Confirm which one was intended for this test.
attributes #0 = { "optnone" }
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s
2+
3+
; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb`
4+
; implementation and *not* the floating-point exponent trick.
5+
6+
; Codegen test for @llvm.ctlz.v4i32 (second operand false) that expects a
; pshufb in the output and none of the FP-trick instructions after it.
; Unlike the SSE2-only tests, this function carries no attribute group.
define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) {
7+
; CHECK-LABEL: test_v4i32_ssse3:
8+
; CHECK: # %bb.0:
9+
10+
; Must use SSSE3 table LUT:
11+
; CHECK: pshufb
12+
13+
; Must NOT use FP exponent trick. These negative checks only guard the text
; between the first pshufb match above and the retq match below; an FP
; sequence emitted before that pshufb would not be caught.
14+
; CHECK-NOT: cvtdq2ps
15+
; CHECK-NOT: psrld $23
16+
; CHECK-NOT: psubd
17+
18+
; CHECK: retq
19+
%res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
20+
ret <4 x i32> %res
21+
}
22+
23+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

0 commit comments

Comments
 (0)