
Commit ededcb0

[AArch64] Refactor AArch64InstrInfo::isAsCheapAsAMove (NFC)
- Remove `FeatureCustomCheapAsMoveHandling`: when target features that affect `isAsCheapAsAMove` can be given on the command line or passed via attributes, every sub-target effectively has custom handling.
- Remove the special handling of `FMOVD0` etc.: an `FMOV` with an immediate zero operand is never[1] more expensive than an `FMOV` with a register operand.
- Remove the special handling of `COPY`: a copy is trivially as cheap as itself.
- Make the function default to the `MachineInstr` attribute `isAsCheapAsAMove`.
- Remove the special handling of `ANDWrr` etc. and of `ANDWri` etc.: the fallback `MachineInstr` attribute is already non-zero.
- Remove the special handling of `ADDWri`/`SUBWri`/`ADDXri`/`SUBXri`: these always[1] have one-cycle latency with maximum (for the micro-architecture) throughput.
- Check whether `MOVi32Imm`/`MOVi64Imm` can be expanded into a "cheap" sequence of instructions.

There is a little twist to determining whether a `MOVi32Imm`/`MOVi64Imm` is "as cheap as a move". Even if one of these pseudo-instructions needs to be expanded to more than one MOVZ, MOVN, or MOVK instruction, materialisation may still be preferable to allocating a register to hold the constant. For the moment a cutoff at two instructions seems like a reasonable compromise.

[1] according to 19 software optimisation manuals

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D154722
1 parent b4301df commit ededcb0
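To make the "at most two instructions" cutoff concrete, here is a minimal standalone sketch. It is an illustration only: `roughMovImmCount` is a hypothetical helper, not the in-tree `AArch64_IMM::expandMOVImm`, which also knows about MOVN, ORR-encodable bitmask immediates and other tricks and can therefore only do better than this chunk count. The two constants are the ones used in the new test further down.

#include <cstdint>
#include <cstdio>

// Rough model only: count how many MOVZ/MOVK instructions a plain
// chunk-by-chunk materialisation of a 64-bit immediate would take.
static unsigned roughMovImmCount(uint64_t Imm) {
  unsigned Count = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16)
    if ((Imm >> Shift) & 0xffff)
      ++Count; // one MOVZ for the first non-zero chunk, one MOVK per extra chunk
  return Count ? Count : 1; // zero still needs one instruction (or a zero register)
}

int main() {
  printf("%u\n", roughMovImmCount(0x1234000056780001)); // 3 -> not "cheap", hoisted in @f0
  printf("%u\n", roughMovImmCount(0x1234000056780000)); // 2 -> "cheap", rematerialised in @f1
  return 0;
}

With the cutoff at two, only the second constant is treated as cheap as a move, which is exactly the difference the new test below checks for.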

File tree

3 files changed: +136 -83 lines changed

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 1 addition & 12 deletions
@@ -223,14 +223,9 @@ def FeatureEnableSelectOptimize : SubtargetFeature<
     "enable-select-opt", "EnableSelectOptimize", "true",
     "Enable the select optimize pass for select loop heuristics">;
 
-def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
-    "HasCustomCheapAsMoveHandling", "true",
-    "Use custom handling of cheap instructions">;
-
 def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
     "HasExynosCheapAsMoveHandling", "true",
-    "Use Exynos specific handling of cheap instructions",
-    [FeatureCustomCheapAsMoveHandling]>;
+    "Use Exynos specific handling of cheap instructions">;
 
 def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
@@ -794,7 +789,6 @@ def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
     FeatureFuseAES,
     FeatureFuseAdrpAdd,
     FeatureBalanceFPOps,
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler]>;
 
 def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
@@ -815,7 +809,6 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
     "Cortex-A57 ARM processors", [
     FeatureFuseAES,
     FeatureBalanceFPOps,
-    FeatureCustomCheapAsMoveHandling,
     FeatureFuseAdrpAdd,
     FeatureFuseLiterals,
     FeaturePostRAScheduler,
@@ -1110,7 +1103,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
 
 def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
     "Qualcomm Kryo processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1120,7 +1112,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
 
 def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
     "Qualcomm Falkor processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1188,7 +1179,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
 
 def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
     "Qualcomm Saphira processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1237,7 +1227,6 @@ def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
 
 def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
     "HiSilicon TS-V110 processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeatureFuseAES,
     FeaturePostRAScheduler]>;
 
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 18 additions & 71 deletions
@@ -878,93 +878,40 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
       .addImm(CC);
 }
 
-/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
-static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
-  uint64_t Imm = MI.getOperand(1).getImm();
-  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
-  uint64_t Encoding;
-  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+// Return true if Imm can be loaded into a register by a "cheap" sequence of
+// instructions. For now, "cheap" means at most two instructions.
+static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
+  if (BitSize == 32)
+    return true;
+
+  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
+  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
+  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
+
+  return Is.size() <= 2;
 }
 
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
-  if (!Subtarget.hasCustomCheapAsMoveHandling())
-    return MI.isAsCheapAsAMove();
-
-  const unsigned Opcode = MI.getOpcode();
-
-  // Firstly, check cases gated by features.
-
-  if (Subtarget.hasZeroCycleZeroingFP()) {
-    if (Opcode == AArch64::FMOVH0 ||
-        Opcode == AArch64::FMOVS0 ||
-        Opcode == AArch64::FMOVD0)
-      return true;
-  }
-
-  if (Subtarget.hasZeroCycleZeroingGP()) {
-    if (Opcode == TargetOpcode::COPY &&
-        (MI.getOperand(1).getReg() == AArch64::WZR ||
-         MI.getOperand(1).getReg() == AArch64::XZR))
-      return true;
-  }
-
-  // Secondly, check cases specific to sub-targets.
-
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
     if (isExynosCheapAsMove(MI))
       return true;
-
     return MI.isAsCheapAsAMove();
   }
 
-  // Finally, check generic cases.
-
-  switch (Opcode) {
+  switch (MI.getOpcode()) {
   default:
-    return false;
-
-  // add/sub on register without shift
-  case AArch64::ADDWri:
-  case AArch64::ADDXri:
-  case AArch64::SUBWri:
-  case AArch64::SUBXri:
-    return (MI.getOperand(3).getImm() == 0);
-
-  // logical ops on immediate
-  case AArch64::ANDWri:
-  case AArch64::ANDXri:
-  case AArch64::EORWri:
-  case AArch64::EORXri:
-  case AArch64::ORRWri:
-  case AArch64::ORRXri:
-    return true;
-
-  // logical ops on register without shift
-  case AArch64::ANDWrr:
-  case AArch64::ANDXrr:
-  case AArch64::BICWrr:
-  case AArch64::BICXrr:
-  case AArch64::EONWrr:
-  case AArch64::EONXrr:
-  case AArch64::EORWrr:
-  case AArch64::EORXrr:
-  case AArch64::ORNWrr:
-  case AArch64::ORNXrr:
-  case AArch64::ORRWrr:
-  case AArch64::ORRXrr:
-    return true;
-
+    return MI.isAsCheapAsAMove();
   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
-  // ORRXri, it is as cheap as MOV
+  // ORRXri, it is as cheap as MOV.
+  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
   case AArch64::MOVi32imm:
-    return canBeExpandedToORR(MI, 32);
+    return isCheapImmediate(MI, 32);
   case AArch64::MOVi64imm:
-    return canBeExpandedToORR(MI, 64);
+    return isCheapImmediate(MI, 64);
   }
-
-  llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
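For context on how the hook is consumed, here is a hedged sketch, not the in-tree pass code: the helper name and the simplified logic are illustrative assumptions. Loop passes such as MachineLICM query `TargetInstrInfo::isAsCheapAsAMove` when weighing whether an invariant instruction is worth hoisting and holding in a register across a loop.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Illustrative helper only (not LLVM API): if an instruction is as cheap as a
// move, rematerialising it at each use is fine, so tying up a register for the
// whole loop is usually not worth it.
static bool worthHoistingOutOfLoop(const MachineInstr &MI,
                                   const TargetInstrInfo &TII) {
  if (TII.isAsCheapAsAMove(MI))
    return false; // e.g. a MOVi64imm that expands to at most two instructions
  return true;    // expensive to recompute; real passes also weigh register pressure
}

With this change a MOVi64imm of 0x1234000056780001 (three MOVZ/MOVK instructions) is not reported as cheap and remains a hoisting candidate, while 0x1234000056780000 (two instructions) is reported cheap and can simply be re-materialised, matching the behaviour the new test below pins down.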
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux"
+
+; Check an "expensive" construction of a constant is hoisted out of a loop
+define void @f0(ptr %a, i64 %n) {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w23, -40
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x21, #1 // =0x1
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    movk x21, #22136, lsl #16
+; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    movk x21, #4660, lsl #48
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.ge .LBB0_2
+; CHECK-NEXT:  .LBB0_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x23, x22, #2
+; CHECK-NEXT:    mov x1, x21
+; CHECK-NEXT:    ldr w0, [x20, x23]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    str w0, [x20, x23]
+; CHECK-NEXT:    add x22, x22, #1
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.lt .LBB0_1
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361857) ; 0x1234000056780001
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+; Check a "cheap" to construct constant is materialised inside a loop.
+define void @f1(ptr %a, i64 %n) {
+; CHECK-LABEL: f1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x21, xzr
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.ge .LBB1_2
+; CHECK-NEXT:  .LBB1_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x22, x21, #2
+; CHECK-NEXT:    mov x1, #1450704896 // =0x56780000
+; CHECK-NEXT:    movk x1, #4660, lsl #48
+; CHECK-NEXT:    ldr w0, [x20, x22]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    str w0, [x20, x22]
+; CHECK-NEXT:    add x21, x21, #1
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.lt .LBB1_1
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361856) ; 0x1234000056780000
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+declare i32 @g(i32, i64)
