Skip to content

Commit f7cc34c

Browse files
author
Sjoerd Meijer
committed
[SelectionDAG] Codesize: don't expand SHIFT to SHIFT_PARTS
And instead just generate a libcall. My motivating example on ARM was a simple shl i64 %A, %B, for which the code bloat is quite significant. For other targets that also accept __int128/i128, such as AArch64 and X86, it is also beneficial for these cases to generate a libcall when optimising for minsize. On these 64-bit targets, the 64-bit shifts are of course unaffected because the SHIFT/SHIFT_PARTS lowering operation action is not set to custom/expand. Differential Revision: https://reviews.llvm.org/D57386 llvm-svn: 352736
1 parent a493843 commit f7cc34c

File tree

8 files changed

+320
-3
lines changed

8 files changed

+320
-3
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,13 @@ class TargetLoweringBase {
642642
return RepRegClassCostForVT[VT.SimpleTy];
643643
}
644644

645+
/// Return true if SHIFT instructions should be expanded to SHIFT_PARTS
646+
/// instructions, and false if a library call is preferred (e.g. for code-size
647+
/// reasons). This default implementation always returns true; targets
/// override it to opt into the library call.
648+
virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
649+
return true;
650+
}
651+
645652
/// Return true if the target has native support for the specified value type.
646653
/// This means that it has a register that directly holds it without
647654
/// promotions or expansions.

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2765,11 +2765,15 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
27652765
}
27662766

27672767
// Next check to see if the target supports this SHL_PARTS operation or if it
2768-
// will custom expand it.
2768+
// will custom expand it. Don't lower this to SHL_PARTS when we optimise for
2769+
// size, but create a libcall instead.
27692770
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
27702771
TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
2771-
if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
2772-
Action == TargetLowering::Custom) {
2772+
const bool LegalOrCustom =
2773+
(Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
2774+
Action == TargetLowering::Custom;
2775+
2776+
if (LegalOrCustom && TLI.shouldExpandShift(DAG, N)) {
27732777
// Expand the subcomponents.
27742778
SDValue LHSL, LHSH;
27752779
GetExpandedInteger(N->getOperand(0), LHSL, LHSH);

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,12 @@ class AArch64TargetLowering : public TargetLowering {
469469
return VT.getSizeInBits() >= 64; // vector 'bic'
470470
}
471471

472+
// Prefer a library call over SHIFT_PARTS expansion when the function is
// optimised for minimum size; expand as usual otherwise.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
473+
if (DAG.getMachineFunction().getFunction().optForMinSize())
474+
return false;
475+
return true;
476+
}
477+
472478
bool shouldTransformSignedTruncationCheck(EVT XVT,
473479
unsigned KeptBits) const override {
474480
// For vectors, we don't have a preference..

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,12 @@ class VectorType;
567567
return HasStandaloneRem;
568568
}
569569

570+
// Prefer a library call over SHIFT_PARTS expansion when the function is
// optimised for minimum size; expand as usual otherwise.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
571+
if (DAG.getMachineFunction().getFunction().optForMinSize())
572+
return false;
573+
return true;
574+
}
575+
570576
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
571577
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
572578

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,12 @@ namespace llvm {
831831
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
832832
}
833833

834+
// Prefer a library call over SHIFT_PARTS expansion when the function is
// optimised for minimum size; expand as usual otherwise.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
835+
if (DAG.getMachineFunction().getFunction().optForMinSize())
836+
return false;
837+
return true;
838+
}
839+
834840
bool shouldSplatInsEltVarIndex(EVT VT) const override;
835841

836842
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
3+
4+
define i64 @f0(i64 %val, i64 %amt) minsize optsize {
5+
; CHECK-LABEL: f0:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: lsl x0, x0, x1
8+
; CHECK-NEXT: ret
9+
%res = shl i64 %val, %amt
10+
ret i64 %res
11+
}
12+
13+
define i32 @f1(i64 %x, i64 %y) minsize optsize {
14+
; CHECK-LABEL: f1:
15+
; CHECK: // %bb.0:
16+
; CHECK-NEXT: lsl x0, x0, x1
17+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
18+
; CHECK-NEXT: ret
19+
%a = shl i64 %x, %y
20+
%b = trunc i64 %a to i32
21+
ret i32 %b
22+
}
23+
24+
define i32 @f2(i64 %x, i64 %y) minsize optsize {
25+
; CHECK-LABEL: f2:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: asr x0, x0, x1
28+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
29+
; CHECK-NEXT: ret
30+
%a = ashr i64 %x, %y
31+
%b = trunc i64 %a to i32
32+
ret i32 %b
33+
}
34+
35+
define i32 @f3(i64 %x, i64 %y) minsize optsize {
36+
; CHECK-LABEL: f3:
37+
; CHECK: // %bb.0:
38+
; CHECK-NEXT: lsr x0, x0, x1
39+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
40+
; CHECK-NEXT: ret
41+
%a = lshr i64 %x, %y
42+
%b = trunc i64 %a to i32
43+
ret i32 %b
44+
}
45+
46+
define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
47+
; CHECK-LABEL: shl128:
48+
; CHECK: // %bb.0: // %entry
49+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
50+
; CHECK-NEXT: .cfi_def_cfa_offset 16
51+
; CHECK-NEXT: .cfi_offset w30, -16
52+
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
53+
; CHECK-NEXT: bl __ashlti3
54+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
55+
; CHECK-NEXT: ret
56+
entry:
57+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
58+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
59+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
60+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
61+
%conv = sext i8 %y to i32
62+
%sh_prom = zext i32 %conv to i128
63+
%shl = shl i128 %x.sroa.0.0.insert.insert, %sh_prom
64+
%retval.sroa.0.0.extract.trunc = trunc i128 %shl to i64
65+
%retval.sroa.2.0.extract.shift = lshr i128 %shl, 64
66+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
67+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
68+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
69+
ret { i64, i64 } %.fca.1.insert
70+
}
71+
72+
define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
73+
; CHECK-LABEL: ashr128:
74+
; CHECK: // %bb.0: // %entry
75+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
76+
; CHECK-NEXT: .cfi_def_cfa_offset 16
77+
; CHECK-NEXT: .cfi_offset w30, -16
78+
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
79+
; CHECK-NEXT: bl __ashrti3
80+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
81+
; CHECK-NEXT: ret
82+
entry:
83+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
84+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
85+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
86+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
87+
%conv = sext i8 %y to i32
88+
%sh_prom = zext i32 %conv to i128
89+
%shr = ashr i128 %x.sroa.0.0.insert.insert, %sh_prom
90+
%retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64
91+
%retval.sroa.2.0.extract.shift = lshr i128 %shr, 64
92+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
93+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
94+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
95+
ret { i64, i64 } %.fca.1.insert
96+
}
97+
98+
define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
99+
; CHECK-LABEL: lshr128:
100+
; CHECK: // %bb.0: // %entry
101+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
102+
; CHECK-NEXT: .cfi_def_cfa_offset 16
103+
; CHECK-NEXT: .cfi_offset w30, -16
104+
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
105+
; CHECK-NEXT: bl __lshrti3
106+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
107+
; CHECK-NEXT: ret
108+
entry:
109+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
110+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
111+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
112+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
113+
%conv = sext i8 %y to i32
114+
%sh_prom = zext i32 %conv to i128
115+
%shr = lshr i128 %x.sroa.0.0.insert.insert, %sh_prom
116+
%retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64
117+
%retval.sroa.2.0.extract.shift = lshr i128 %shr, 64
118+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
119+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
120+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
121+
ret { i64, i64 } %.fca.1.insert
122+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
2+
3+
define i64 @f0(i64 %val, i64 %amt) minsize optsize {
4+
; CHECK-LABEL: f0:
5+
; CHECK: bl __aeabi_llsl
6+
%res = shl i64 %val, %amt
7+
ret i64 %res
8+
}
9+
10+
define i32 @f1(i64 %x, i64 %y) minsize optsize {
11+
; CHECK-LABEL: f1:
12+
; CHECK: bl __aeabi_llsl
13+
%a = shl i64 %x, %y
14+
%b = trunc i64 %a to i32
15+
ret i32 %b
16+
}
17+
18+
define i32 @f2(i64 %x, i64 %y) minsize optsize {
19+
; CHECK-LABEL: f2:
20+
; CHECK: bl __aeabi_lasr
21+
%a = ashr i64 %x, %y
22+
%b = trunc i64 %a to i32
23+
ret i32 %b
24+
}
25+
26+
define i32 @f3(i64 %x, i64 %y) minsize optsize {
27+
; CHECK-LABEL: f3:
28+
; CHECK: bl __aeabi_llsr
29+
%a = lshr i64 %x, %y
30+
%b = trunc i64 %a to i32
31+
ret i32 %b
32+
}
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
3+
4+
define i64 @f0(i64 %val, i64 %amt) minsize optsize {
5+
; CHECK-LABEL: f0:
6+
; CHECK: # %bb.0:
7+
; CHECK-NEXT: movq %rsi, %rcx
8+
; CHECK-NEXT: movq %rdi, %rax
9+
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
10+
; CHECK-NEXT: shlq %cl, %rax
11+
; CHECK-NEXT: retq
12+
%res = shl i64 %val, %amt
13+
ret i64 %res
14+
}
15+
16+
define i32 @f1(i64 %x, i64 %y) minsize optsize {
17+
; CHECK-LABEL: f1:
18+
; CHECK: # %bb.0:
19+
; CHECK-NEXT: movq %rsi, %rcx
20+
; CHECK-NEXT: movq %rdi, %rax
21+
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
22+
; CHECK-NEXT: shlq %cl, %rax
23+
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
24+
; CHECK-NEXT: retq
25+
%a = shl i64 %x, %y
26+
%b = trunc i64 %a to i32
27+
ret i32 %b
28+
}
29+
30+
define i32 @f2(i64 %x, i64 %y) minsize optsize {
31+
; CHECK-LABEL: f2:
32+
; CHECK: # %bb.0:
33+
; CHECK-NEXT: movq %rsi, %rcx
34+
; CHECK-NEXT: movq %rdi, %rax
35+
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
36+
; CHECK-NEXT: sarq %cl, %rax
37+
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
38+
; CHECK-NEXT: retq
39+
%a = ashr i64 %x, %y
40+
%b = trunc i64 %a to i32
41+
ret i32 %b
42+
}
43+
44+
define i32 @f3(i64 %x, i64 %y) minsize optsize {
45+
; CHECK-LABEL: f3:
46+
; CHECK: # %bb.0:
47+
; CHECK-NEXT: movq %rsi, %rcx
48+
; CHECK-NEXT: movq %rdi, %rax
49+
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
50+
; CHECK-NEXT: shrq %cl, %rax
51+
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
52+
; CHECK-NEXT: retq
53+
%a = lshr i64 %x, %y
54+
%b = trunc i64 %a to i32
55+
ret i32 %b
56+
}
57+
58+
define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
59+
; CHECK-LABEL: shl128:
60+
; CHECK: # %bb.0: # %entry
61+
; CHECK-NEXT: pushq %rax
62+
; CHECK-NEXT: .cfi_def_cfa_offset 16
63+
; CHECK-NEXT: movzbl %dl, %edx
64+
; CHECK-NEXT: callq __ashlti3
65+
; CHECK-NEXT: popq %rcx
66+
; CHECK-NEXT: .cfi_def_cfa_offset 8
67+
; CHECK-NEXT: retq
68+
entry:
69+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
70+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
71+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
72+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
73+
%conv = sext i8 %y to i32
74+
%sh_prom = zext i32 %conv to i128
75+
%shl = shl i128 %x.sroa.0.0.insert.insert, %sh_prom
76+
%retval.sroa.0.0.extract.trunc = trunc i128 %shl to i64
77+
%retval.sroa.2.0.extract.shift = lshr i128 %shl, 64
78+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
79+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
80+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
81+
ret { i64, i64 } %.fca.1.insert
82+
}
83+
84+
define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
85+
; CHECK-LABEL: ashr128:
86+
; CHECK: # %bb.0: # %entry
87+
; CHECK-NEXT: pushq %rax
88+
; CHECK-NEXT: .cfi_def_cfa_offset 16
89+
; CHECK-NEXT: callq __ashrti3
90+
; CHECK-NEXT: popq %rcx
91+
; CHECK-NEXT: .cfi_def_cfa_offset 8
92+
; CHECK-NEXT: retq
93+
entry:
94+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
95+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
96+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
97+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
98+
%conv = sext i8 %y to i32
99+
%sh_prom = zext i32 %conv to i128
100+
%shr = ashr i128 %x.sroa.0.0.insert.insert, %sh_prom
101+
%retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64
102+
%retval.sroa.2.0.extract.shift = lshr i128 %shr, 64
103+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
104+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
105+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
106+
ret { i64, i64 } %.fca.1.insert
107+
}
108+
109+
define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signext %y) minsize optsize {
110+
; CHECK-LABEL: lshr128:
111+
; CHECK: # %bb.0: # %entry
112+
; CHECK-NEXT: pushq %rax
113+
; CHECK-NEXT: .cfi_def_cfa_offset 16
114+
; CHECK-NEXT: movzbl %dl, %edx
115+
; CHECK-NEXT: callq __lshrti3
116+
; CHECK-NEXT: popq %rcx
117+
; CHECK-NEXT: .cfi_def_cfa_offset 8
118+
; CHECK-NEXT: retq
119+
entry:
120+
%x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
121+
%x.sroa.2.0.insert.shift = shl nuw i128 %x.sroa.2.0.insert.ext, 64
122+
%x.sroa.0.0.insert.ext = zext i64 %x.coerce0 to i128
123+
%x.sroa.0.0.insert.insert = or i128 %x.sroa.2.0.insert.shift, %x.sroa.0.0.insert.ext
124+
%conv = sext i8 %y to i32
125+
%sh_prom = zext i32 %conv to i128
126+
%shr = lshr i128 %x.sroa.0.0.insert.insert, %sh_prom
127+
%retval.sroa.0.0.extract.trunc = trunc i128 %shr to i64
128+
%retval.sroa.2.0.extract.shift = lshr i128 %shr, 64
129+
%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
130+
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
131+
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
132+
ret { i64, i64 } %.fca.1.insert
133+
}
134+

0 commit comments

Comments
 (0)