Skip to content

Commit 7d0a208

Browse files
authored
[AArch64] Treat COPY between cross-register banks as expensive (#167661)
The motivation is to allow passes such as MachineLICM to hoist trivial FMOV instructions out of loops; previously they did not do so even when the RHS was a constant. On most architectures, these expensive move instructions have a latency of 2-6 cycles and are certainly not as cheap as a 0-1 cycle move.
1 parent 909c9aa commit 7d0a208

File tree

2 files changed

+222
-0
lines changed

2 files changed

+222
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,6 +1144,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
11441144
return Is.size() <= 2;
11451145
}
11461146

1147+
// Check if a COPY instruction is cheap.
1148+
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1149+
assert(MI.isCopy() && "Expected COPY instruction");
1150+
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1151+
1152+
// Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1153+
// typically requiring an FMOV instruction with a 2-6 cycle latency.
1154+
auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1155+
if (Reg.isVirtual())
1156+
return MRI.getRegClass(Reg);
1157+
if (Reg.isPhysical())
1158+
return RI.getMinimalPhysRegClass(Reg);
1159+
return nullptr;
1160+
};
1161+
const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1162+
const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1163+
if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1164+
return false;
1165+
1166+
return MI.isAsCheapAsAMove();
1167+
}
1168+
11471169
// FIXME: this implementation should be micro-architecture dependent, so a
11481170
// micro-architecture target hook should be introduced here in future.
11491171
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
@@ -1157,6 +1179,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
11571179
default:
11581180
return MI.isAsCheapAsAMove();
11591181

1182+
case TargetOpcode::COPY:
1183+
return isCheapCopy(MI, RI);
1184+
11601185
case AArch64::ADDWrs:
11611186
case AArch64::ADDXrs:
11621187
case AArch64::SUBWrs:
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s
3+
4+
# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR)
5+
# are hoisted out of loops by MachineLICM, as they are expensive on AArch64.
6+
7+
--- |
8+
declare void @use_float(float)
9+
declare void @use_int(i32)
10+
11+
define void @gpr_to_fpr_virtual_copy_hoisted() {
12+
ret void
13+
}
14+
15+
define void @gpr_to_fpr_physical_copy_hoisted() {
16+
ret void
17+
}
18+
19+
define void @fpr_to_gpr_virtual_copy_hoisted() {
20+
ret void
21+
}
22+
...
23+
---
24+
name: gpr_to_fpr_virtual_copy_hoisted
25+
tracksRegLiveness: true
26+
body: |
27+
; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted
28+
; CHECK: bb.0:
29+
; CHECK-NEXT: successors: %bb.1(0x80000000)
30+
; CHECK-NEXT: liveins: $w0, $w1
31+
; CHECK-NEXT: {{ $}}
32+
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
33+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
34+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
35+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
36+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]]
37+
; CHECK-NEXT: {{ $}}
38+
; CHECK-NEXT: bb.1:
39+
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
40+
; CHECK-NEXT: {{ $}}
41+
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
42+
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
43+
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
44+
; CHECK-NEXT: B %bb.2
45+
; CHECK-NEXT: {{ $}}
46+
; CHECK-NEXT: bb.2:
47+
; CHECK-NEXT: successors: %bb.1(0x80000000)
48+
; CHECK-NEXT: {{ $}}
49+
; CHECK-NEXT: $s0 = COPY [[COPY4]]
50+
; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
51+
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
52+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
53+
; CHECK-NEXT: B %bb.1
54+
; CHECK-NEXT: {{ $}}
55+
; CHECK-NEXT: bb.3:
56+
; CHECK-NEXT: RET_ReallyLR
57+
bb.0:
58+
liveins: $w0, $w1
59+
%1:gpr32 = COPY $w0
60+
%0:gpr32 = COPY $w1
61+
%3:gpr32all = COPY $wzr
62+
%2:gpr32all = COPY %3:gpr32all
63+
64+
bb.1:
65+
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
66+
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
67+
Bcc 1, %bb.3, implicit $nzcv
68+
B %bb.2
69+
70+
bb.2:
71+
%7:fpr32 = COPY %0:gpr32
72+
$s0 = COPY %7:fpr32
73+
BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
74+
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
75+
%5:gpr32all = COPY %8:gpr32sp
76+
B %bb.1
77+
78+
bb.3:
79+
RET_ReallyLR
80+
81+
...
82+
---
83+
name: gpr_to_fpr_physical_copy_hoisted
84+
tracksRegLiveness: true
85+
body: |
86+
; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted
87+
; CHECK: bb.0:
88+
; CHECK-NEXT: successors: %bb.1(0x80000000)
89+
; CHECK-NEXT: liveins: $w0
90+
; CHECK-NEXT: {{ $}}
91+
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
92+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr
93+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]]
94+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr
95+
; CHECK-NEXT: {{ $}}
96+
; CHECK-NEXT: bb.1:
97+
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
98+
; CHECK-NEXT: {{ $}}
99+
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2
100+
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
101+
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
102+
; CHECK-NEXT: B %bb.2
103+
; CHECK-NEXT: {{ $}}
104+
; CHECK-NEXT: bb.2:
105+
; CHECK-NEXT: successors: %bb.1(0x80000000)
106+
; CHECK-NEXT: {{ $}}
107+
; CHECK-NEXT: $s0 = COPY [[COPY3]]
108+
; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
109+
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
110+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
111+
; CHECK-NEXT: B %bb.1
112+
; CHECK-NEXT: {{ $}}
113+
; CHECK-NEXT: bb.3:
114+
; CHECK-NEXT: RET_ReallyLR
115+
bb.0:
116+
liveins: $w0
117+
%1:gpr32 = COPY $w0
118+
%3:gpr32all = COPY $wzr
119+
%2:gpr32all = COPY %3:gpr32all
120+
121+
bb.1:
122+
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
123+
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
124+
Bcc 1, %bb.3, implicit $nzcv
125+
B %bb.2
126+
127+
bb.2:
128+
%7:fpr32 = COPY $wzr
129+
$s0 = COPY %7:fpr32
130+
BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
131+
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
132+
%5:gpr32all = COPY %8:gpr32sp
133+
B %bb.1
134+
135+
bb.3:
136+
RET_ReallyLR
137+
138+
...
139+
---
140+
name: fpr_to_gpr_virtual_copy_hoisted
141+
tracksRegLiveness: true
142+
body: |
143+
; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted
144+
; CHECK: bb.0:
145+
; CHECK-NEXT: successors: %bb.1(0x80000000)
146+
; CHECK-NEXT: liveins: $w0, $s0
147+
; CHECK-NEXT: {{ $}}
148+
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
149+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0
150+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
151+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
152+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]]
153+
; CHECK-NEXT: {{ $}}
154+
; CHECK-NEXT: bb.1:
155+
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
156+
; CHECK-NEXT: {{ $}}
157+
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
158+
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
159+
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
160+
; CHECK-NEXT: B %bb.2
161+
; CHECK-NEXT: {{ $}}
162+
; CHECK-NEXT: bb.2:
163+
; CHECK-NEXT: successors: %bb.1(0x80000000)
164+
; CHECK-NEXT: {{ $}}
165+
; CHECK-NEXT: $w0 = COPY [[COPY4]]
166+
; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp
167+
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
168+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
169+
; CHECK-NEXT: B %bb.1
170+
; CHECK-NEXT: {{ $}}
171+
; CHECK-NEXT: bb.3:
172+
; CHECK-NEXT: RET_ReallyLR
173+
bb.0:
174+
liveins: $w0, $s0
175+
%1:gpr32 = COPY $w0
176+
%0:fpr32 = COPY $s0
177+
%3:gpr32all = COPY $wzr
178+
%2:gpr32all = COPY %3:gpr32all
179+
180+
bb.1:
181+
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
182+
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
183+
Bcc 1, %bb.3, implicit $nzcv
184+
B %bb.2
185+
186+
bb.2:
187+
%7:gpr32 = COPY %0:fpr32
188+
$w0 = COPY %7:gpr32
189+
BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp
190+
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
191+
%5:gpr32all = COPY %8:gpr32sp
192+
B %bb.1
193+
194+
bb.3:
195+
RET_ReallyLR
196+
197+
...

0 commit comments

Comments
 (0)