Skip to content

Commit 3f257e5

Browse files
committed
[AArch64] Consider COPY between disjoint register classes as expensive
The motivation is to allow passes such as MachineLICM to hoist trivial FMOV instructions out of loops, where previously they did not do so even when the RHS is a constant. On most architectures, these expensive move instructions have a latency of 2-6 cycles, and are certainly not as cheap as a 0-1 cycle move.
1 parent 94a7006 commit 3f257e5

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
10431043
return Is.size() <= 2;
10441044
}
10451045

1046+
// Check if a COPY instruction is cheap.
1047+
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1048+
assert(MI.isCopy() && "Expected COPY instruction");
1049+
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1050+
1051+
// Cross-register-class copies (e.g., between GPR and FPR) are expensive on
1052+
// AArch64, typically requiring an FMOV instruction with a 2-6 cycle latency.
1053+
auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1054+
if (Reg.isVirtual())
1055+
return MRI.getRegClass(Reg);
1056+
if (Reg.isPhysical())
1057+
return RI.getMinimalPhysRegClass(Reg);
1058+
return nullptr;
1059+
};
1060+
const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1061+
const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1062+
if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1063+
return false;
1064+
1065+
return MI.isAsCheapAsAMove();
1066+
}
1067+
10461068
// FIXME: this implementation should be micro-architecture dependent, so a
10471069
// micro-architecture target hook should be introduced here in future.
10481070
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
@@ -1056,6 +1078,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
10561078
default:
10571079
return MI.isAsCheapAsAMove();
10581080

1081+
case TargetOpcode::COPY:
1082+
return isCheapCopy(MI, RI);
1083+
10591084
case AArch64::ADDWrs:
10601085
case AArch64::ADDXrs:
10611086
case AArch64::SUBWrs:
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s
3+
4+
# This test verifies that cross-register-class copies (e.g., GPR to FPR)
5+
# are hoisted out of loops by MachineLICM, as they are expensive on AArch64.
6+
7+
--- |
8+
declare void @use_float(float)
9+
10+
define void @cross_regclass_copy_hoisted() {
11+
ret void
12+
}
13+
...
14+
---
15+
name: cross_regclass_copy_hoisted
16+
tracksRegLiveness: true
17+
body: |
18+
; CHECK-LABEL: name: cross_regclass_copy_hoisted
19+
; CHECK: bb.0:
20+
; CHECK-NEXT: successors: %bb.1(0x80000000)
21+
; CHECK-NEXT: liveins: $w0, $w1
22+
; CHECK-NEXT: {{ $}}
23+
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
24+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
25+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
26+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
27+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]]
28+
; CHECK-NEXT: {{ $}}
29+
; CHECK-NEXT: bb.1:
30+
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
31+
; CHECK-NEXT: {{ $}}
32+
; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
33+
; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
34+
; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
35+
; CHECK-NEXT: B %bb.2
36+
; CHECK-NEXT: {{ $}}
37+
; CHECK-NEXT: bb.2:
38+
; CHECK-NEXT: successors: %bb.1(0x80000000)
39+
; CHECK-NEXT: {{ $}}
40+
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
41+
; CHECK-NEXT: $s0 = COPY [[COPY4]]
42+
; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
43+
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
44+
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
45+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
46+
; CHECK-NEXT: B %bb.1
47+
; CHECK-NEXT: {{ $}}
48+
; CHECK-NEXT: bb.3:
49+
; CHECK-NEXT: RET_ReallyLR
50+
bb.0:
51+
liveins: $w0, $w1
52+
%1:gpr32 = COPY $w0
53+
%0:gpr32 = COPY $w1
54+
%3:gpr32all = COPY $wzr
55+
%2:gpr32all = COPY %3:gpr32all
56+
57+
bb.1:
58+
%4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
59+
%6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
60+
Bcc 1, %bb.3, implicit $nzcv
61+
B %bb.2
62+
63+
bb.2:
64+
%7:fpr32 = COPY %0:gpr32
65+
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
66+
$s0 = COPY %7:fpr32
67+
BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
68+
ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
69+
%8:gpr32sp = ADDWri %4:gpr32common, 1, 0
70+
%5:gpr32all = COPY %8:gpr32sp
71+
B %bb.1
72+
73+
bb.3:
74+
RET_ReallyLR
75+
76+
...

0 commit comments

Comments
 (0)