Skip to content

Commit 506bc0c

Browse files
committed
[X86] Truncate i64 sub to i32 when upper 33 bits are zeros
1 parent 20becf3 commit 506bc0c

File tree

3 files changed

+214
-0
lines changed

3 files changed

+214
-0
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58064,8 +58064,28 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
5806458064
EVT VT = N->getValueType(0);
5806558065
SDValue Op0 = N->getOperand(0);
5806658066
SDValue Op1 = N->getOperand(1);
58067+
unsigned int Opcode = N->getOpcode();
5806758068
SDLoc DL(N);
5806858069

58070+
// Use a 32-bit sub + zext if the upper 33 bits of both operands are known zero.
58071+
if (VT == MVT::i64 && Subtarget.is64Bit()) {
58072+
APInt HiMask = APInt::getHighBitsSet(64, 33);
58073+
if (DAG.MaskedValueIsZero(Op0, HiMask) &&
58074+
DAG.MaskedValueIsZero(Op1, HiMask)) {
58075+
SDValue LHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
58076+
SDValue RHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
58077+
bool NUW = Op0->getFlags().hasNoUnsignedWrap();
58078+
NUW = NUW & DAG.willNotOverflowAdd(false, LHS, RHS);
58079+
SDNodeFlags Flags;
58080+
Flags.setNoUnsignedWrap(NUW);
58081+
// NSW always holds: both operands are below 2^31, so even the worst case,
58082+
// 0 - 2147483647 = -2147483647, still fits in i32.
58083+
Flags.setNoSignedWrap(true);
58084+
SDValue Sub = DAG.getNode(Opcode, DL, MVT::i32, LHS, RHS, Flags);
58085+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sub);
58086+
}
58087+
}
58088+
5806958089
auto IsNonOpaqueConstant = [&](SDValue Op) {
5807058090
return DAG.isConstantIntBuildVectorOrConstantInt(Op,
5807158091
/*AllowOpaques*/ false);
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=SSE
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
6+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
7+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
8+
9+
define <2 x i64> @test1(ptr%ptr) {
10+
; SSE-LABEL: test1:
11+
; SSE: # %bb.0: # %entry
12+
; SSE-NEXT: movzbl (%rdi), %eax
13+
; SSE-NEXT: movzbl %al, %ecx
14+
; SSE-NEXT: shrb %al
15+
; SSE-NEXT: movzbl %al, %eax
16+
; SSE-NEXT: negl %eax
17+
; SSE-NEXT: movd %eax, %xmm1
18+
; SSE-NEXT: andl $1, %ecx
19+
; SSE-NEXT: negl %ecx
20+
; SSE-NEXT: movd %ecx, %xmm0
21+
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
22+
; SSE-NEXT: retq
23+
;
24+
; SSE2-LABEL: test1:
25+
; SSE2: # %bb.0: # %entry
26+
; SSE2-NEXT: movzbl (%rdi), %eax
27+
; SSE2-NEXT: movzbl %al, %ecx
28+
; SSE2-NEXT: shrb %al
29+
; SSE2-NEXT: movzbl %al, %eax
30+
; SSE2-NEXT: negl %eax
31+
; SSE2-NEXT: movd %eax, %xmm1
32+
; SSE2-NEXT: andl $1, %ecx
33+
; SSE2-NEXT: negl %ecx
34+
; SSE2-NEXT: movd %ecx, %xmm0
35+
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
36+
; SSE2-NEXT: retq
37+
;
38+
; SSSE3-LABEL: test1:
39+
; SSSE3: # %bb.0: # %entry
40+
; SSSE3-NEXT: movzbl (%rdi), %eax
41+
; SSSE3-NEXT: movzbl %al, %ecx
42+
; SSSE3-NEXT: shrb %al
43+
; SSSE3-NEXT: movzbl %al, %eax
44+
; SSSE3-NEXT: negl %eax
45+
; SSSE3-NEXT: movd %eax, %xmm1
46+
; SSSE3-NEXT: andl $1, %ecx
47+
; SSSE3-NEXT: negl %ecx
48+
; SSSE3-NEXT: movd %ecx, %xmm0
49+
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
50+
; SSSE3-NEXT: retq
51+
;
52+
; SSE41-LABEL: test1:
53+
; SSE41: # %bb.0: # %entry
54+
; SSE41-NEXT: movzbl (%rdi), %eax
55+
; SSE41-NEXT: movzbl %al, %ecx
56+
; SSE41-NEXT: shrb %al
57+
; SSE41-NEXT: movzbl %al, %eax
58+
; SSE41-NEXT: negl %eax
59+
; SSE41-NEXT: movd %eax, %xmm1
60+
; SSE41-NEXT: andl $1, %ecx
61+
; SSE41-NEXT: negl %ecx
62+
; SSE41-NEXT: movd %ecx, %xmm0
63+
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
64+
; SSE41-NEXT: retq
65+
;
66+
; AVX1-LABEL: test1:
67+
; AVX1: # %bb.0: # %entry
68+
; AVX1-NEXT: movzbl (%rdi), %eax
69+
; AVX1-NEXT: movzbl %al, %ecx
70+
; AVX1-NEXT: shrb %al
71+
; AVX1-NEXT: movzbl %al, %eax
72+
; AVX1-NEXT: negl %eax
73+
; AVX1-NEXT: vmovd %eax, %xmm0
74+
; AVX1-NEXT: andl $1, %ecx
75+
; AVX1-NEXT: negl %ecx
76+
; AVX1-NEXT: vmovd %ecx, %xmm1
77+
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
78+
; AVX1-NEXT: retq
79+
;
80+
; AVX2-LABEL: test1:
81+
; AVX2: # %bb.0: # %entry
82+
; AVX2-NEXT: movzbl (%rdi), %eax
83+
; AVX2-NEXT: movzbl %al, %ecx
84+
; AVX2-NEXT: shrb %al
85+
; AVX2-NEXT: movzbl %al, %eax
86+
; AVX2-NEXT: negl %eax
87+
; AVX2-NEXT: vmovd %eax, %xmm0
88+
; AVX2-NEXT: andl $1, %ecx
89+
; AVX2-NEXT: negl %ecx
90+
; AVX2-NEXT: vmovd %ecx, %xmm1
91+
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
92+
; AVX2-NEXT: retq
93+
entry:
94+
%X = load <2 x i1>, ptr %ptr
95+
%Y = sext <2 x i1> %X to <2 x i64>
96+
ret <2 x i64> %Y
97+
}
98+
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=i686-unknown-unknown -disable-cgp-branch-opts | FileCheck %s --check-prefix=X86
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -disable-cgp-branch-opts | FileCheck %s --check-prefix=X64
4+
5+
; Truncate to a 32-bit subtraction since the upper 48 bits of both operands are zero
6+
define i64 @test1(i16 %a, i16 %b) nounwind {
7+
; X86-LABEL: test1:
8+
; X86: # %bb.0:
9+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
10+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
11+
; X86-NEXT: xorl %edx, %edx
12+
; X86-NEXT: subl %ecx, %eax
13+
; X86-NEXT: sbbl %edx, %edx
14+
; X86-NEXT: retl
15+
;
16+
; X64-LABEL: test1:
17+
; X64: # %bb.0:
18+
; X64-NEXT: movzwl %si, %ecx
19+
; X64-NEXT: movzwl %di, %eax
20+
; X64-NEXT: subl %ecx, %eax
21+
; X64-NEXT: retq
22+
%zext_a = zext i16 %a to i64
23+
%zext_b = zext i16 %b to i64
24+
%sub = sub i64 %zext_a, %zext_b
25+
ret i64 %sub
26+
}
27+
28+
; Do not truncate to a 32-bit subtraction if bit 32 of an operand is set
29+
define i64 @test2(i16 %a, i16 %b) nounwind {
30+
; X86-LABEL: test2:
31+
; X86: # %bb.0:
32+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
33+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
34+
; X86-NEXT: subl %ecx, %eax
35+
; X86-NEXT: movl $1, %edx
36+
; X86-NEXT: sbbl $0, %edx
37+
; X86-NEXT: retl
38+
;
39+
; X64-LABEL: test2:
40+
; X64: # %bb.0:
41+
; X64-NEXT: movzwl %di, %ecx
42+
; X64-NEXT: movzwl %si, %edx
43+
; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
44+
; X64-NEXT: orq %rcx, %rax
45+
; X64-NEXT: subq %rdx, %rax
46+
; X64-NEXT: retq
47+
%zext_a = zext i16 %a to i64
48+
%zext_b = zext i16 %b to i64
49+
%or_a = or i64 %zext_a, 4294967296
50+
%sub = sub i64 %or_a, %zext_b
51+
ret i64 %sub
52+
}
53+
54+
; Do not truncate to a 32-bit subtraction when an operand is sign-extended
55+
define i64 @test3(i16 %a, i16 %b) nounwind {
56+
; X86-LABEL: test3:
57+
; X86: # %bb.0:
58+
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
59+
; X86-NEXT: movl %eax, %edx
60+
; X86-NEXT: sarl $31, %edx
61+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
62+
; X86-NEXT: subl %ecx, %eax
63+
; X86-NEXT: sbbl $0, %edx
64+
; X86-NEXT: retl
65+
;
66+
; X64-LABEL: test3:
67+
; X64: # %bb.0:
68+
; X64-NEXT: # kill: def $edi killed $edi def $rdi
69+
; X64-NEXT: movswq %di, %rax
70+
; X64-NEXT: movzwl %si, %ecx
71+
; X64-NEXT: subq %rcx, %rax
72+
; X64-NEXT: retq
73+
%sext_a = sext i16 %a to i64
74+
%zext_b = zext i16 %b to i64
75+
%sub = sub i64 %sext_a, %zext_b
76+
ret i64 %sub
77+
}
78+
79+
define i64 @test4(i16 %x) nounwind {
80+
; X86-LABEL: test4:
81+
; X86: # %bb.0:
82+
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
83+
; X86-NEXT: xorl %edx, %edx
84+
; X86-NEXT: negl %eax
85+
; X86-NEXT: sbbl %edx, %edx
86+
; X86-NEXT: retl
87+
;
88+
; X64-LABEL: test4:
89+
; X64: # %bb.0:
90+
; X64-NEXT: movzwl %di, %eax
91+
; X64-NEXT: negl %eax
92+
; X64-NEXT: retq
93+
%zext_x = zext i16 %x to i64
94+
%sub = sub i64 0, %zext_x
95+
ret i64 %sub
96+
}

0 commit comments

Comments
 (0)