Skip to content

Commit 0eb0a65

Browse files
[AArch64] Correctly determine if {ADD,SUB}{W,X}rs instructions are cheap
These instructions are marked "as cheap as a move". According to publicly available Software Optimization Guides, they have one-cycle latency and maximum throughput only on some microarchitectures, only for `LSL`, and only for some shift amounts. This patch uses the subtarget feature `FeatureALULSLFast` to determine how cheap the instructions are.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D152827

Change-Id: I8f0d7e79bcf277ebf959719991c29a1bc7829486
1 parent d2b71c7 commit 0eb0a65

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,13 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904904
switch (MI.getOpcode()) {
905905
default:
906906
return MI.isAsCheapAsAMove();
907+
908+
case AArch64::ADDWrs:
909+
case AArch64::ADDXrs:
910+
case AArch64::SUBWrs:
911+
case AArch64::SUBXrs:
912+
return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
913+
907914
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
908915
// ORRXri, it is as cheap as MOV.
909916
// Likewise if it can be expanded to MOVZ/MOVN/MOVK.
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc < %s -o - | FileCheck %s
3+
; RUN: llc -mattr=+alu-lsl-fast < %s -o - | FileCheck %s -check-prefix=LSLFAST
4+
target triple = "aarch64-linux"
5+
6+
declare void @g(...)
7+
8+
; Check that ADDWrs/ADDXrs with shift > 4 is considered relatively
9+
; slow, thus CSE-d.
10+
define void @f0(i1 %c0, i1 %c1, ptr %a, i64 %i) {
11+
; CHECK-LABEL: f0:
12+
; CHECK: // %bb.0: // %E
13+
; CHECK-NEXT: tbz w0, #0, .LBB0_5
14+
; CHECK-NEXT: // %bb.1: // %A
15+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
16+
; CHECK-NEXT: .cfi_def_cfa_offset 16
17+
; CHECK-NEXT: .cfi_offset w30, -16
18+
; CHECK-NEXT: add x0, x2, x3, lsl #5
19+
; CHECK-NEXT: tbz w1, #0, .LBB0_3
20+
; CHECK-NEXT: // %bb.2: // %B
21+
; CHECK-NEXT: bl g
22+
; CHECK-NEXT: b .LBB0_4
23+
; CHECK-NEXT: .LBB0_3: // %C
24+
; CHECK-NEXT: mov x1, x0
25+
; CHECK-NEXT: bl g
26+
; CHECK-NEXT: .LBB0_4:
27+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
28+
; CHECK-NEXT: .LBB0_5: // %X
29+
; CHECK-NEXT: ret
30+
;
31+
; LSLFAST-LABEL: f0:
32+
; LSLFAST: // %bb.0: // %E
33+
; LSLFAST-NEXT: tbz w0, #0, .LBB0_5
34+
; LSLFAST-NEXT: // %bb.1: // %A
35+
; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
36+
; LSLFAST-NEXT: .cfi_def_cfa_offset 16
37+
; LSLFAST-NEXT: .cfi_offset w30, -16
38+
; LSLFAST-NEXT: add x0, x2, x3, lsl #5
39+
; LSLFAST-NEXT: tbz w1, #0, .LBB0_3
40+
; LSLFAST-NEXT: // %bb.2: // %B
41+
; LSLFAST-NEXT: bl g
42+
; LSLFAST-NEXT: b .LBB0_4
43+
; LSLFAST-NEXT: .LBB0_3: // %C
44+
; LSLFAST-NEXT: mov x1, x0
45+
; LSLFAST-NEXT: bl g
46+
; LSLFAST-NEXT: .LBB0_4:
47+
; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
48+
; LSLFAST-NEXT: .LBB0_5: // %X
49+
; LSLFAST-NEXT: ret
50+
; The GEP below scales %i by 32 bytes (4 x i64), which selects to an ADD with
; lsl #5. A shift amount of 5 is above the <= 4 "cheap" limit, so the
; recomputation in %C is CSE-d under both check configurations (note the single
; add in the CHECK and LSLFAST lines above).
E:
51+
%p0 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
52+
br i1 %c0, label %A, label %X
53+
54+
A:
55+
br i1 %c1, label %B, label %C
56+
57+
B:
58+
call void @g(ptr %p0)
59+
br label %X
60+
61+
; %p1 recomputes the same address as %p0 from %E; it is expected to be CSE-d
; with %p0 (hence the "mov x1, x0" rather than a second add in the checks).
C:
62+
%p1 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
63+
call void @g(ptr %p1, ptr %p0)
64+
br label %X
65+
66+
X:
67+
ret void
68+
}
69+
70+
; Check that ADDWrs/ADDXrs with shift <= 4 is considered relatively fast on sub-targets
71+
; with feature +alu-lsl-fast, thus *not* CSE-d.
72+
define void @f1(i1 %c0, i1 %c1, ptr %a, i64 %i) {
73+
; CHECK-LABEL: f1:
74+
; CHECK: // %bb.0: // %E
75+
; CHECK-NEXT: tbz w0, #0, .LBB1_5
76+
; CHECK-NEXT: // %bb.1: // %A
77+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
78+
; CHECK-NEXT: .cfi_def_cfa_offset 16
79+
; CHECK-NEXT: .cfi_offset w30, -16
80+
; CHECK-NEXT: add x0, x2, x3, lsl #4
81+
; CHECK-NEXT: tbz w1, #0, .LBB1_3
82+
; CHECK-NEXT: // %bb.2: // %B
83+
; CHECK-NEXT: bl g
84+
; CHECK-NEXT: b .LBB1_4
85+
; CHECK-NEXT: .LBB1_3: // %C
86+
; CHECK-NEXT: mov x1, x0
87+
; CHECK-NEXT: bl g
88+
; CHECK-NEXT: .LBB1_4:
89+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
90+
; CHECK-NEXT: .LBB1_5: // %X
91+
; CHECK-NEXT: ret
92+
;
93+
; LSLFAST-LABEL: f1:
94+
; LSLFAST: // %bb.0: // %E
95+
; LSLFAST-NEXT: tbz w0, #0, .LBB1_5
96+
; LSLFAST-NEXT: // %bb.1: // %A
97+
; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
98+
; LSLFAST-NEXT: .cfi_def_cfa_offset 16
99+
; LSLFAST-NEXT: .cfi_offset w30, -16
100+
; LSLFAST-NEXT: add x8, x2, x3, lsl #4
101+
; LSLFAST-NEXT: tbz w1, #0, .LBB1_3
102+
; LSLFAST-NEXT: // %bb.2: // %B
103+
; LSLFAST-NEXT: mov x0, x8
104+
; LSLFAST-NEXT: bl g
105+
; LSLFAST-NEXT: b .LBB1_4
106+
; LSLFAST-NEXT: .LBB1_3: // %C
107+
; LSLFAST-NEXT: add x0, x2, x3, lsl #4
108+
; LSLFAST-NEXT: mov x1, x8
109+
; LSLFAST-NEXT: bl g
110+
; LSLFAST-NEXT: .LBB1_4:
111+
; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
112+
; LSLFAST-NEXT: .LBB1_5: // %X
113+
; LSLFAST-NEXT: ret
114+
; Here the GEP scales %i by 16 bytes (2 x i64), i.e. an ADD with lsl #4, which
; is within the <= 4 "cheap" shift limit. With +alu-lsl-fast the add is
; considered as cheap as a move, so the recomputation in %C is kept (two adds
; in the LSLFAST lines above) instead of being CSE-d (one add in CHECK).
E:
115+
%p0 = getelementptr {i64, i64}, ptr %a, i64 %i
116+
br i1 %c0, label %A, label %X
117+
118+
A:
119+
br i1 %c1, label %B, label %C
120+
121+
B:
122+
call void @g(ptr %p0)
123+
br label %X
124+
125+
; %p1 is the same address as %p0; under +alu-lsl-fast it is expected to stay
; as a separate add rather than being CSE-d with %p0.
C:
126+
%p1 = getelementptr {i64, i64}, ptr %a, i64 %i
127+
call void @g(ptr %p1, ptr %p0)
128+
br label %X
129+
130+
X:
131+
ret void
132+
}

0 commit comments

Comments
 (0)