[AArch64][SME] Propagate desired ZA states in the MachineSMEABIPass #149510
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6
Note: This regression is fixed by #149065
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This patch adds a step to the MachineSMEABIPass that propagates desired ZA states. This aims to pick better ZA states for edge bundles, as when many (or all) blocks in a bundle do not have a preferred ZA state, the ZA state assigned to a bundle can be less than ideal. An important case is nested loops, where only the inner loop has a preferred ZA state. Here we'd like to propagate the ZA state from the inner loop to the outer loops (to avoid saves/restores in any loop). A source-level sketch of this nested-loop case follows the diff below.

Patch is 42.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149510.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41fc8c08..139684172f1bb 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 4650b2d0c8151..1dc2ec2d01b88 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -792,8 +792,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
void AArch64PassConfig::addMachineSSAOptimization() {
- if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
- addPass(createMachineSMEABIPass());
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableNewSMEABILowering)
+ addPass(createMachineSMEABIPass(TM->getOptLevel()));
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
addPass(createSMEPeepholeOptPass());
@@ -826,7 +826,7 @@ bool AArch64PassConfig::addILPOpts() {
void AArch64PassConfig::addPreRegAlloc() {
if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering)
- addPass(createMachineSMEABIPass());
+ addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index c39a5cc2fcb16..25f23bc310681 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -176,7 +176,8 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
struct MachineSMEABI : public MachineFunctionPass {
inline static char ID = 0;
- MachineSMEABI() : MachineFunctionPass(ID) {}
+ MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+ : MachineFunctionPass(ID), OptLevel(OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -202,6 +203,11 @@ struct MachineSMEABI : public MachineFunctionPass {
/// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
void insertStateChanges();
+ /// Propagates desired states forwards (from predecessors -> successors) if
+ /// \p Forwards, otherwise, propagates backwards (from successors ->
+ /// predecessors).
+ void propagateDesiredStates(bool Forwards = true);
+
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
@@ -276,12 +282,16 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Contains the needed ZA state for each instruction in a block.
/// Instructions that do not require a ZA state are not recorded.
struct BlockInfo {
- ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
+ ZAState FixedEntryState{ZAState::ANY};
+ ZAState DesiredIncomingState{ZAState::ANY};
+ ZAState DesiredOutgoingState{ZAState::ANY};
LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
+ CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
// All pass state that must be cleared between functions.
struct PassState {
SmallVector<BlockInfo> Blocks;
@@ -299,6 +309,7 @@ struct MachineSMEABI : public MachineFunctionPass {
const AArch64FunctionInfo *AFI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ MachineLoopInfo *MLI = nullptr;
};
void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
@@ -367,51 +378,105 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// Reverse vector (as we had to iterate backwards for liveness).
std::reverse(Block.Insts.begin(), Block.Insts.end());
+
+ // Record the desired states on entry/exit of this block. These are the
+ // states that would not incur a state transition.
+ if (!Block.Insts.empty()) {
+ Block.DesiredIncomingState = Block.Insts.front().NeededState;
+ Block.DesiredOutgoingState = Block.Insts.back().NeededState;
+ }
+ }
+}
+
+void MachineSMEABI::propagateDesiredStates(bool Forwards) {
+ // If `Forwards`, this propagates desired states from predecessors to
+ // successors, otherwise, this propagates states from successors to
+ // predecessors.
+ auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
+ return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
+ };
+
+ SmallVector<MachineBasicBlock *> Worklist;
+ for (auto [BlockID, BlockInfo] : enumerate(State.Blocks)) {
+ if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
+ Worklist.push_back(MF->getBlockNumbered(BlockID));
+ }
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+ auto &BlockInfo = State.Blocks[MBB->getNumber()];
+
+ // Pick a legal edge bundle state that matches the majority of
+ // predecessors/successors.
+ int StateCounts[ZAState::NUM_ZA_STATE] = {0};
+ for (MachineBasicBlock *PredOrSucc :
+ Forwards ? predecessors(MBB) : successors(MBB)) {
+ auto &PredOrSuccBlockInfo = State.Blocks[PredOrSucc->getNumber()];
+ auto ZAState = GetBlockState(PredOrSuccBlockInfo, !Forwards);
+ if (isLegalEdgeBundleZAState(ZAState))
+ StateCounts[ZAState]++;
+ }
+
+ ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
+ auto &CurrentState = GetBlockState(BlockInfo, Forwards);
+ if (PropagatedState != CurrentState) {
+ CurrentState = PropagatedState;
+ auto &OtherState = GetBlockState(BlockInfo, !Forwards);
+ // Propagate to the incoming/outgoing state if that is also "ANY".
+ if (OtherState == ZAState::ANY)
+ OtherState = PropagatedState;
+ // Push any successors/predecessors that may need updating to the
+ // worklist.
+ for (MachineBasicBlock *SuccOrPred :
+ Forwards ? successors(MBB) : predecessors(MBB)) {
+ auto &SuccOrPredBlockInfo = State.Blocks[SuccOrPred->getNumber()];
+ if (!isLegalEdgeBundleZAState(
+ GetBlockState(SuccOrPredBlockInfo, Forwards)))
+ Worklist.push_back(SuccOrPred);
+ }
+ }
}
}
void MachineSMEABI::assignBundleZAStates() {
State.BundleStates.resize(Bundles->getNumBundles());
+
for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
// Attempt to assign a ZA state for this bundle that minimizes state
// transitions. Edges within loops are given a higher weight as we assume
// they will be executed more than once.
- // TODO: We should propagate desired incoming/outgoing states through blocks
- // that have the "ANY" state first to make better global decisions.
int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
for (unsigned BlockID : Bundles->getBlocks(I)) {
LLVM_DEBUG(dbgs() << "- bb." << BlockID);
- const BlockInfo &Block = State.Blocks[BlockID];
- if (Block.Insts.empty()) {
- LLVM_DEBUG(dbgs() << " (no state preference)\n");
- continue;
- }
+ BlockInfo &Block = State.Blocks[BlockID];
bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
- ZAState DesiredIncomingState = Block.Insts.front().NeededState;
- if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
- EdgeStateCounts[DesiredIncomingState]++;
+ bool LegalInEdge =
+ InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
+ bool LegalOutEdge =
+ OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
+ if (LegalInEdge) {
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
- << getZAStateString(DesiredIncomingState));
+ << getZAStateString(Block.DesiredIncomingState));
+ EdgeStateCounts[Block.DesiredIncomingState]++;
}
- ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
- if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
- EdgeStateCounts[DesiredOutgoingState]++;
+ if (LegalOutEdge) {
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
- << getZAStateString(DesiredOutgoingState));
+ << getZAStateString(Block.DesiredOutgoingState));
+ EdgeStateCounts[Block.DesiredOutgoingState]++;
}
+ if (!LegalInEdge && !LegalOutEdge)
+ LLVM_DEBUG(dbgs() << " (no state preference)");
LLVM_DEBUG(dbgs() << '\n');
}
ZAState BundleState =
ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
- // Force ZA to be active in bundles that don't have a preferred state.
- // TODO: Something better here (to avoid extra mode switches).
if (BundleState == ZAState::ANY)
BundleState = ZAState::ACTIVE;
@@ -817,6 +882,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
collectNeededZAStates(SMEFnAttrs);
+ if (OptLevel != CodeGenOptLevel::None) {
+ for (bool Forwards : {true, false})
+ propagateDesiredStates(Forwards);
+ }
assignBundleZAStates();
insertStateChanges();
@@ -839,4 +908,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+ return new MachineSMEABI(OptLevel);
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index a0a14f2ffae3f..077e9b5ced624 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -361,7 +361,6 @@ define i64 @test_many_callee_arguments(
ret i64 %ret
}
-; FIXME: The new lowering should avoid saves/restores in the probing loop.
define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
; CHECK: // %bb.0:
@@ -399,18 +398,14 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: mov x8, sp
; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x19
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: mrs x8, NZCV
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: msr NZCV, x8
; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: b .LBB7_1
; CHECK-NEWLOWERING-NEXT: .LBB7_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x19
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 18ea07e38fe89..c753e9c569d22 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -228,65 +228,34 @@ exit:
ret void
}
-; FIXME: The codegen for this case could be improved (by tuning weights).
-; Here the ZA save has been hoisted out of the conditional, but would be better
-; to sink it.
define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind {
-; CHECK-LABEL: cond_private_za_call:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: tbz w0, #0, .LBB3_4
-; CHECK-NEXT: // %bb.1: // %private_za_call
-; CHECK-NEXT: sub x8, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEXT: bl private_za_call
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB3_3
-; CHECK-NEXT: // %bb.2: // %private_za_call
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB3_3: // %private_za_call
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB3_4: // %exit
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: cond_private_za_call:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB3_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %private_za_call
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB3_2: // %exit
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB3_4: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: b shared_za_call
+; CHECK-COMMON-LABEL: cond_private_za_call:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x29, sp
+; CHECK-COMMON-NEXT: sub sp, sp, #16
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: mov x9, sp
+; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x9
+; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT: tbz w0, #0, .LBB3_4
+; CHECK-COMMON-NEXT: // %bb.1: // %private_za_call
+; CHECK-COMMON-NEXT: sub x8, x29, #16
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT: bl private_za_call
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT: sub x0, x29, #16
+; CHECK-COMMON-NEXT: cbnz x8, .LBB3_3
+; CHECK-COMMON-NEXT: // %bb.2: // %private_za_call
+; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT: .LBB3_3: // %private_za_call
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: .LBB3_4: // %exit
+; CHECK-COMMON-NEXT: mov sp, x29
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: b shared_za_call
br i1 %cond, label %private_za_call, label %exit
private_za_call:
@@ -910,7 +879,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEWLOWERING-LABEL: loop_with_external_entry:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -923,23 +892,27 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
+; CHECK-NEWLOWERING-NEXT: b .LBB11_4
; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6
+; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB11_3
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5: // %exit
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3
+; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: b .LBB11_3
; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index bb88142efa592..506974a14c3be 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -56,31 +56,23 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
; CHECK-NEXT: adrp x8, .L.str
; CHECK-NEXT: add x8, x8, :lo12:.L.str
; CHECK-NEXT: str x8, [x0]
-; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .Ltmp0: // EH_LABEL
; CHECK-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr
; CHECK-NEXT: mov x2, xzr
; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
; CHECK-NEXT: bl __cxa_throw
-; CHECK-NEXT: .Ltmp1:
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_4
-; CHECK-NEXT: // %bb.3: // %throw_exception
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB0_4: // %throw_exception
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: // %bb.5: // %throw_fail
-; CHECK-NEXT: .LBB0_6: // %unwind_dtors
-; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .Ltmp1: // EH_LABEL
+; CHECK-NEXT: // %bb.3: // %throw_fail
+; CHECK-NEXT: .LBB0_4: // %unwind_dtors
+; CHECK-NEXT: .Ltmp2: // EH_LABEL
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_8
-; CHECK-NEXT: // %bb.7: // %unwind_dtors
+; CHECK-NEXT: cbnz x8, .LBB0_6
+; CHECK-NEXT: // %bb.5: // %unwind_dtors
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB0_8: // %unwind_dtors
+; CHECK-NEXT: .LBB0_6: // %unwind_dtors
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: sub x8, x29, #16
@@ -142,11 +134,11 @@ define dso_local void @try_catch() "aarch64_...
[truncated]
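A source-level sketch of the nested-loop case described above (illustrative only: the function names and loop structure are invented, and it assumes a compiler with SME ACLE support for the __arm_inout("za") keyword attribute, e.g. a recent clang targeting AArch64 with +sme):

// Hypothetical example, not taken from this patch or its tests.
// Only the inner loop has a preferred ZA state (ACTIVE, via the shared-ZA
// callee). Before this patch, the outer loop's blocks had no preference, so
// the edge bundles around the outer backedge could be assigned a state that
// forces ZA saves/restores on every outer iteration. With propagation, the
// desired ACTIVE state spreads from the inner loop outwards, and the single
// ACTIVE -> LOCAL_SAVED transition is placed around the private-ZA call.
extern void shared_za_call(void) __arm_inout("za"); // desires ZA ACTIVE
extern void private_za_call(void); // private-ZA callee: ZA must be saved

void nested_loops(int n, int m) __arm_inout("za") {
  for (int i = 0; i < n; ++i)   // outer loop: no ZA preference of its own
    for (int j = 0; j < m; ++j) // inner loop: desires ZA ACTIVE
      shared_za_call();
  private_za_call(); // the only point that wants ZA saved
}

The same effect is visible in the agnostic_za_buffer_alloc_with_stack_probes test above, where __arm_sme_save is now called once before the stack-probing loop rather than on every probe iteration.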
I'm not too familiar with ZA state changes, but in general the code itself looks good to me.
After learning more about ZA state changes I'm still happy with this 👍
LGTM with nits addressed
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/11/builds/26866