Skip to content

Commit 72b8956

Browse files
committed
[AArch64][SME] Avoid ZA save state changes in loops in MachineSMEABIPass
This patch uses the MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not find loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag. This makes the MachineSMEABIPass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes in the loop (which we assume will happen more than once). This does require some extra analysis, so this is only enabled at -O1 and above. Change-Id: If318c809d2f7cc1fca144fbe424ba2a2ca7fb19f
1 parent 57d4c90 commit 72b8956

File tree

3 files changed

+158
-35
lines changed

3 files changed

+158
-35
lines changed

llvm/lib/Target/AArch64/MachineSMEABIPass.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,20 @@
6363
#include "llvm/CodeGen/LivePhysRegs.h"
6464
#include "llvm/CodeGen/MachineBasicBlock.h"
6565
#include "llvm/CodeGen/MachineFunctionPass.h"
66+
#include "llvm/CodeGen/MachineLoopInfo.h"
6667
#include "llvm/CodeGen/MachineRegisterInfo.h"
6768
#include "llvm/CodeGen/TargetRegisterInfo.h"
6869

6970
using namespace llvm;
7071

7172
#define DEBUG_TYPE "aarch64-machine-sme-abi"
7273

74+
static cl::opt<int>
75+
LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
76+
cl::init(10),
77+
cl::desc("Edge weight for basic blocks within loops (used "
78+
"for placing ZA saves/restores)"));
79+
7380
namespace {
7481

7582
enum ZAState {
@@ -255,6 +262,9 @@ struct MachineSMEABI : public MachineFunctionPass {
255262
void getAnalysisUsage(AnalysisUsage &AU) const override {
256263
AU.setPreservesCFG();
257264
AU.addRequired<EdgeBundlesWrapperLegacy>();
265+
// Only analyse loops at -O1 and above.
266+
if (OptLevel != CodeGenOptLevel::None)
267+
AU.addRequired<MachineLoopInfoWrapperPass>();
258268
AU.addPreservedID(MachineLoopInfoID);
259269
AU.addPreservedID(MachineDominatorsID);
260270
MachineFunctionPass::getAnalysisUsage(AU);
@@ -516,24 +526,31 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
516526
int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
517527
for (unsigned BlockID : Bundles.getBlocks(I)) {
518528
LLVM_DEBUG(dbgs() << "- bb." << BlockID);
519-
520529
const BlockInfo &Block = FnInfo.Blocks[BlockID];
530+
bool IsLoop = MLI && MLI->getLoopFor(MF->getBlockNumbered(BlockID));
521531
bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
522532
bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
523533

534+
// TODO: Use MachineBranchProbabilityInfo for edge weights?
535+
int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
536+
if (IsLoop)
537+
LLVM_DEBUG(dbgs() << " IsLoop");
538+
524539
bool LegalInEdge =
525540
InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
526541
bool LegalOutEgde =
527542
OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
543+
544+
LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
528545
if (LegalInEdge) {
529546
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
530547
<< getZAStateString(Block.DesiredIncomingState));
531-
EdgeStateCounts[Block.DesiredIncomingState]++;
548+
EdgeStateCounts[Block.DesiredIncomingState] += EdgeWeight;
532549
}
533550
if (LegalOutEgde) {
534551
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
535552
<< getZAStateString(Block.DesiredOutgoingState));
536-
EdgeStateCounts[Block.DesiredOutgoingState]++;
553+
EdgeStateCounts[Block.DesiredOutgoingState] += EdgeWeight;
537554
}
538555
if (!LegalInEdge && !LegalOutEgde)
539556
LLVM_DEBUG(dbgs() << " (no state preference)");
@@ -982,6 +999,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
982999
TII = Subtarget->getInstrInfo();
9831000
TRI = Subtarget->getRegisterInfo();
9841001
MRI = &MF.getRegInfo();
1002+
if (OptLevel != CodeGenOptLevel::None)
1003+
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
9851004

9861005
const EdgeBundles &Bundles =
9871006
getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
3+
; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
4+
5+
declare void @private_za_call()
6+
declare void @shared_za_call() "aarch64_inout_za"
7+
8+
; This test checks that at -O0 we don't attempt to optimize lazy save state
9+
; changes in loops, and that at -O1 (and above) we attempt to push state changes
10+
; out of loops.
11+
12+
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
13+
; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
14+
; CHECK-O0: // %bb.0: // %entry
15+
; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
16+
; CHECK-O0-NEXT: mov x29, sp
17+
; CHECK-O0-NEXT: sub sp, sp, #32
18+
; CHECK-O0-NEXT: rdsvl x9, #1
19+
; CHECK-O0-NEXT: mov x8, sp
20+
; CHECK-O0-NEXT: msub x8, x9, x9, x8
21+
; CHECK-O0-NEXT: mov sp, x8
22+
; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
23+
; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
24+
; CHECK-O0-NEXT: bl shared_za_call
25+
; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
26+
; CHECK-O0-NEXT: mov w8, wzr
27+
; CHECK-O0-NEXT: subs w9, w0, #1
28+
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
29+
; CHECK-O0-NEXT: b.lt .LBB0_4
30+
; CHECK-O0-NEXT: b .LBB0_1
31+
; CHECK-O0-NEXT: .LBB0_1: // %loop
32+
; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
33+
; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
34+
; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
35+
; CHECK-O0-NEXT: sub x8, x29, #16
36+
; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
37+
; CHECK-O0-NEXT: bl private_za_call
38+
; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
39+
; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
40+
; CHECK-O0-NEXT: add w9, w8, #1
41+
; CHECK-O0-NEXT: mov w8, w9
42+
; CHECK-O0-NEXT: smstart za
43+
; CHECK-O0-NEXT: mrs x11, TPIDR2_EL0
44+
; CHECK-O0-NEXT: sub x0, x29, #16
45+
; CHECK-O0-NEXT: cbz x11, .LBB0_2
46+
; CHECK-O0-NEXT: b .LBB0_3
47+
; CHECK-O0-NEXT: .LBB0_2: // %loop
48+
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
49+
; CHECK-O0-NEXT: bl __arm_tpidr2_restore
50+
; CHECK-O0-NEXT: b .LBB0_3
51+
; CHECK-O0-NEXT: .LBB0_3: // %loop
52+
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
53+
; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
54+
; CHECK-O0-NEXT: subs w9, w9, w10
55+
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
56+
; CHECK-O0-NEXT: b.ne .LBB0_1
57+
; CHECK-O0-NEXT: b .LBB0_4
58+
; CHECK-O0-NEXT: .LBB0_4: // %exit
59+
; CHECK-O0-NEXT: mov sp, x29
60+
; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
61+
; CHECK-O0-NEXT: b shared_za_call
62+
;
63+
; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
64+
; CHECK-O1: // %bb.0: // %entry
65+
; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
66+
; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
67+
; CHECK-O1-NEXT: mov x29, sp
68+
; CHECK-O1-NEXT: sub sp, sp, #16
69+
; CHECK-O1-NEXT: rdsvl x8, #1
70+
; CHECK-O1-NEXT: mov x9, sp
71+
; CHECK-O1-NEXT: msub x9, x8, x8, x9
72+
; CHECK-O1-NEXT: mov sp, x9
73+
; CHECK-O1-NEXT: mov w19, w0
74+
; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
75+
; CHECK-O1-NEXT: bl shared_za_call
76+
; CHECK-O1-NEXT: sub x8, x29, #16
77+
; CHECK-O1-NEXT: cmp w19, #1
78+
; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
79+
; CHECK-O1-NEXT: b.lt .LBB0_2
80+
; CHECK-O1-NEXT: .LBB0_1: // %loop
81+
; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
82+
; CHECK-O1-NEXT: bl private_za_call
83+
; CHECK-O1-NEXT: subs w19, w19, #1
84+
; CHECK-O1-NEXT: b.ne .LBB0_1
85+
; CHECK-O1-NEXT: .LBB0_2: // %exit
86+
; CHECK-O1-NEXT: smstart za
87+
; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
88+
; CHECK-O1-NEXT: sub x0, x29, #16
89+
; CHECK-O1-NEXT: cbnz x8, .LBB0_4
90+
; CHECK-O1-NEXT: // %bb.3: // %exit
91+
; CHECK-O1-NEXT: bl __arm_tpidr2_restore
92+
; CHECK-O1-NEXT: .LBB0_4: // %exit
93+
; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
94+
; CHECK-O1-NEXT: mov sp, x29
95+
; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
96+
; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
97+
; CHECK-O1-NEXT: b shared_za_call
98+
entry:
99+
%cmpgt = icmp sgt i32 %n, 0
100+
tail call void @shared_za_call()
101+
br i1 %cmpgt, label %loop, label %exit
102+
103+
loop:
104+
%iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
105+
tail call void @private_za_call()
106+
%next_iv = add nuw nsw i32 %iv, 1
107+
%cmpeq = icmp eq i32 %next_iv, %n
108+
br i1 %cmpeq, label %exit, label %loop
109+
110+
exit:
111+
tail call void @shared_za_call()
112+
ret void
113+
}

llvm/test/CodeGen/AArch64/sme-za-control-flow.ll

Lines changed: 23 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ exit:
9696
ret void
9797
}
9898

99-
; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
99+
; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
100100
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
101101
; CHECK-LABEL: private_za_loop_active_entry_and_exit:
102102
; CHECK: // %bb.0: // %entry
@@ -142,7 +142,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
142142
; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
143143
; CHECK-NEWLOWERING: // %bb.0: // %entry
144144
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
145-
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
145+
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
146146
; CHECK-NEWLOWERING-NEXT: mov x29, sp
147147
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
148148
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -152,31 +152,26 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
152152
; CHECK-NEWLOWERING-NEXT: mov w19, w0
153153
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
154154
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
155+
; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
155156
; CHECK-NEWLOWERING-NEXT: cmp w19, #1
156-
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
157-
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
158-
; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
159-
; CHECK-NEWLOWERING-NEXT: b .LBB1_3
160-
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
161-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
162-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
163-
; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
164-
; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
157+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
158+
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_2
159+
; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %loop
165160
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
166-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
167161
; CHECK-NEWLOWERING-NEXT: bl private_za_call
168-
; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
162+
; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
163+
; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_1
164+
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %exit
169165
; CHECK-NEWLOWERING-NEXT: smstart za
170166
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
171167
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
172-
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
173-
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
174-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
168+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_4
169+
; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
175170
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
176-
; CHECK-NEWLOWERING-NEXT: b .LBB1_2
177-
; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
171+
; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %exit
172+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
178173
; CHECK-NEWLOWERING-NEXT: mov sp, x29
179-
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
174+
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
180175
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
181176
; CHECK-NEWLOWERING-NEXT: b shared_za_call
182177
entry:
@@ -879,7 +874,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
879874
; CHECK-NEWLOWERING-LABEL: loop_with_external_entry:
880875
; CHECK-NEWLOWERING: // %bb.0: // %entry
881876
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
882-
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
877+
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
883878
; CHECK-NEWLOWERING-NEXT: mov x29, sp
884879
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
885880
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -892,27 +887,23 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
892887
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init
893888
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
894889
; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader
895-
; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
896-
; CHECK-NEWLOWERING-NEXT: b .LBB11_4
890+
; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
891+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
897892
; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop
898-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
899-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
900-
; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6
901-
; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop
902893
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
903-
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
904894
; CHECK-NEWLOWERING-NEXT: bl private_za_call
895+
; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB11_3
896+
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
905897
; CHECK-NEWLOWERING-NEXT: smstart za
906898
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
907899
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
908-
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3
909-
; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop
910-
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
900+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_6
901+
; CHECK-NEWLOWERING-NEXT: // %bb.5: // %exit
911902
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
912-
; CHECK-NEWLOWERING-NEXT: b .LBB11_3
913903
; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit
904+
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
914905
; CHECK-NEWLOWERING-NEXT: mov sp, x29
915-
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
906+
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
916907
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
917908
; CHECK-NEWLOWERING-NEXT: ret
918909
entry:

0 commit comments

Comments
 (0)