Skip to content

Conversation

@MacDue
Copy link
Member

@MacDue MacDue commented Jul 16, 2025

This patch uses the MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not find loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the -aarch64-sme-abi-loop-edge-weight flag.

This makes the MachineSMEABIPass pass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes in the loop (which we assume will happen more than once).

This does require some extra analysis, so this is only enabled at -O1 and above.

@MacDue MacDue marked this pull request as ready for review July 16, 2025 10:45
@llvmbot
Copy link
Member

llvmbot commented Jul 16, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This patch uses the MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not find loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the -aarch64-sme-abi-loop-edge-weight flag.

This makes the MachineSMEABIPass pass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes in the loop (which we assume will happen more than once).

This does require some extra analysis, so this is only enabled at -O1 and above.


Full diff: https://github.com/llvm/llvm-project/pull/149065.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64.h (+1-1)
  • (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+2-2)
  • (modified) llvm/lib/Target/AArch64/MachineSMEABIPass.cpp (+27-4)
  • (added) llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll (+115)
  • (modified) llvm/test/CodeGen/AArch64/sme-za-control-flow.ll (+14-19)
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41fc8c08..139684172f1bb 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2c1edecd0b48d..b26a137d4e0fb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -792,7 +792,7 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
 
 void AArch64PassConfig::addMachineSSAOptimization() {
   if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
-    addPass(createMachineSMEABIPass());
+    addPass(createMachineSMEABIPass(TM->getOptLevel()));
 
   if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
     addPass(createSMEPeepholeOptPass());
@@ -825,7 +825,7 @@ bool AArch64PassConfig::addILPOpts() {
 
 void AArch64PassConfig::addPreRegAlloc() {
   if (EnableNewSMEABILowering && TM->getOptLevel() == CodeGenOptLevel::None)
-    addPass(createMachineSMEABIPass());
+    addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
 
   // Change dead register definitions to refer to the zero register.
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 7c0cad299cc64..f63a338b4bd23 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 
@@ -28,6 +29,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-machine-sme-abi"
 
+static cl::opt<int>
+    LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
+                   cl::init(10),
+                   cl::desc("Edge weight for basic blocks within loops (used "
+                            "for placing ZA saves/restores)"));
+
 namespace {
 
 enum ZAState {
@@ -112,7 +119,8 @@ getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
 struct MachineSMEABI : public MachineFunctionPass {
   inline static char ID = 0;
 
-  MachineSMEABI() : MachineFunctionPass(ID) {}
+  MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+      : MachineFunctionPass(ID), OptLevel(OptLevel) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -121,6 +129,9 @@ struct MachineSMEABI : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<EdgeBundlesWrapperLegacy>();
+    // Only analyse loops at -O1 and above.
+    if (OptLevel != CodeGenOptLevel::None)
+      AU.addRequired<MachineLoopInfoWrapperPass>();
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -197,6 +208,8 @@ struct MachineSMEABI : public MachineFunctionPass {
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
 
+  CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
   // All pass state that must be cleared between functions.
   struct PassState {
     SmallVector<BlockInfo> Blocks;
@@ -209,6 +222,7 @@ struct MachineSMEABI : public MachineFunctionPass {
   } State;
 
   EdgeBundles *Bundles = nullptr;
+  MachineLoopInfo *MLI = nullptr;
 };
 
 void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
@@ -302,18 +316,23 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
         LLVM_DEBUG(dbgs() << " (no state preference)\n");
         continue;
       }
+      bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID));
       bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
       bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
+      int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
+      if (IsLoop)
+        LLVM_DEBUG(dbgs() << " IsLoop");
 
+      LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
       ZAState DesiredIncomingState = Block.Insts.front().NeededState;
       if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
-        EdgeStateCounts[DesiredIncomingState]++;
+        EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
         LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
                           << getZAStateString(DesiredIncomingState));
       }
       ZAState DesiredOutgoingState = Block.Insts.front().NeededState;
       if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
-        EdgeStateCounts[DesiredOutgoingState]++;
+        EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
         LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
                           << getZAStateString(DesiredOutgoingState));
       }
@@ -771,6 +790,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   // Reset pass state.
   State = PassState{};
   Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+  if (OptLevel != CodeGenOptLevel::None)
+    MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
 
   bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
 
@@ -799,4 +820,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+  return new MachineSMEABI(OptLevel);
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
new file mode 100644
index 0000000000000..200280f52acb0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
+
+declare void @private_za_call()
+declare void @shared_za_call() "aarch64_inout_za"
+
+; This test checks that at -O0 we don't attempt to optimize lazy save state
+; changes in loops, and that at -O1 (and above) we attempt to push state
+; changes out of loops.
+
+define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O0:       // %bb.0: // %entry
+; CHECK-O0-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-O0-NEXT:    mov x29, sp
+; CHECK-O0-NEXT:    sub sp, sp, #32
+; CHECK-O0-NEXT:    rdsvl x9, #1
+; CHECK-O0-NEXT:    mov x8, sp
+; CHECK-O0-NEXT:    msub x8, x9, x9, x8
+; CHECK-O0-NEXT:    mov sp, x8
+; CHECK-O0-NEXT:    stp x8, x9, [x29, #-16]
+; CHECK-O0-NEXT:    stur w0, [x29, #-24] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    bl shared_za_call
+; CHECK-O0-NEXT:    ldur w0, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    mov w8, wzr
+; CHECK-O0-NEXT:    subs w9, w0, #1
+; CHECK-O0-NEXT:    stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    b.lt .LBB0_4
+; CHECK-O0-NEXT:    b .LBB0_1
+; CHECK-O0-NEXT:  .LBB0_1: // %loop
+; CHECK-O0-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-O0-NEXT:    ldur w8, [x29, #-20] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    stur w8, [x29, #-28] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    sub x8, x29, #16
+; CHECK-O0-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-O0-NEXT:    bl private_za_call
+; CHECK-O0-NEXT:    ldur w8, [x29, #-28] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    ldur w10, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    add w9, w8, #1
+; CHECK-O0-NEXT:    mov w8, w9
+; CHECK-O0-NEXT:    subs w9, w9, w10
+; CHECK-O0-NEXT:    mrs x9, NZCV
+; CHECK-O0-NEXT:    smstart za
+; CHECK-O0-NEXT:    mrs x10, TPIDR2_EL0
+; CHECK-O0-NEXT:    sub x0, x29, #16
+; CHECK-O0-NEXT:    cbz x10, .LBB0_2
+; CHECK-O0-NEXT:    b .LBB0_3
+; CHECK-O0-NEXT:  .LBB0_2: // %loop
+; CHECK-O0-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT:    bl __arm_tpidr2_restore
+; CHECK-O0-NEXT:    b .LBB0_3
+; CHECK-O0-NEXT:  .LBB0_3: // %loop
+; CHECK-O0-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-O0-NEXT:    msr NZCV, x9
+; CHECK-O0-NEXT:    stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    b.ne .LBB0_1
+; CHECK-O0-NEXT:    b .LBB0_4
+; CHECK-O0-NEXT:  .LBB0_4: // %exit
+; CHECK-O0-NEXT:    mov sp, x29
+; CHECK-O0-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-O0-NEXT:    b shared_za_call
+;
+; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O1:       // %bb.0: // %entry
+; CHECK-O1-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-O1-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-O1-NEXT:    mov x29, sp
+; CHECK-O1-NEXT:    sub sp, sp, #16
+; CHECK-O1-NEXT:    rdsvl x8, #1
+; CHECK-O1-NEXT:    mov x9, sp
+; CHECK-O1-NEXT:    msub x9, x8, x8, x9
+; CHECK-O1-NEXT:    mov sp, x9
+; CHECK-O1-NEXT:    mov w19, w0
+; CHECK-O1-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-O1-NEXT:    bl shared_za_call
+; CHECK-O1-NEXT:    cmp w19, #1
+; CHECK-O1-NEXT:    sub x8, x29, #16
+; CHECK-O1-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-O1-NEXT:    b.lt .LBB0_2
+; CHECK-O1-NEXT:  .LBB0_1: // %loop
+; CHECK-O1-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-O1-NEXT:    bl private_za_call
+; CHECK-O1-NEXT:    subs w19, w19, #1
+; CHECK-O1-NEXT:    b.ne .LBB0_1
+; CHECK-O1-NEXT:  .LBB0_2: // %exit
+; CHECK-O1-NEXT:    smstart za
+; CHECK-O1-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-O1-NEXT:    sub x0, x29, #16
+; CHECK-O1-NEXT:    cbnz x8, .LBB0_4
+; CHECK-O1-NEXT:  // %bb.3: // %exit
+; CHECK-O1-NEXT:    bl __arm_tpidr2_restore
+; CHECK-O1-NEXT:  .LBB0_4: // %exit
+; CHECK-O1-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-O1-NEXT:    mov sp, x29
+; CHECK-O1-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-O1-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-O1-NEXT:    b shared_za_call
+entry:
+  %cmpgt = icmp sgt i32 %n, 0
+  tail call void @shared_za_call()
+  br i1 %cmpgt, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+  tail call void @private_za_call()
+  %next_iv = add nuw nsw i32 %iv, 1
+  %cmpeq = icmp eq i32 %next_iv, %n
+  br i1 %cmpeq, label %exit, label %loop
+
+exit:
+  tail call void @shared_za_call()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index d3d7e953bedfa..e9ef9d22aaba5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -102,7 +102,7 @@ exit:
   ret void
 }
 
-; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
+; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
 define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
 ; CHECK-LABEL: private_za_loop_active_entry_and_exit:
 ; CHECK:       // %bb.0: // %entry
@@ -154,7 +154,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
 ; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    mov x29, sp
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
@@ -165,30 +165,25 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
 ; CHECK-NEWLOWERING-NEXT:    bl shared_za_call
 ; CHECK-NEWLOWERING-NEXT:    cmp w19, #1
-; CHECK-NEWLOWERING-NEXT:    b.lt .LBB1_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT:    sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT:    b .LBB1_3
-; CHECK-NEWLOWERING-NEXT:  .LBB1_2: // %loop
-; CHECK-NEWLOWERING-NEXT:    // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    cbz w19, .LBB1_5
-; CHECK-NEWLOWERING-NEXT:  .LBB1_3: // %loop
+; CHECK-NEWLOWERING-NEXT:    sub x8, x29, #16
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT:    b.lt .LBB1_2
+; CHECK-NEWLOWERING-NEXT:  .LBB1_1: // %loop
 ; CHECK-NEWLOWERING-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_call
-; CHECK-NEWLOWERING-NEXT:    sub w19, w19, #1
+; CHECK-NEWLOWERING-NEXT:    subs w19, w19, #1
+; CHECK-NEWLOWERING-NEXT:    b.ne .LBB1_1
+; CHECK-NEWLOWERING-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %loop
-; CHECK-NEWLOWERING-NEXT:    // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB1_4
+; CHECK-NEWLOWERING-NEXT:  // %bb.3: // %exit
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:    b .LBB1_2
-; CHECK-NEWLOWERING-NEXT:  .LBB1_5: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB1_4: // %exit
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    b shared_za_call
 entry:

@MacDue MacDue requested a review from sdesmalen-arm July 16, 2025 10:46
@MacDue
Copy link
Member Author

MacDue commented Jul 16, 2025

Note: This patch is a minor improvement to placing saves/restores. For more complex programs, we will need to propagate required ZA states through blocks with "no preference" to make better decisions.

@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from 3fb2e45 to af4a764 Compare July 17, 2025 09:58
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from cab3409 to 2c9e14c Compare July 17, 2025 09:58
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from af4a764 to 323b821 Compare August 5, 2025 11:03
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from 2c9e14c to f182168 Compare August 5, 2025 11:04
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from 323b821 to 75b2bf0 Compare August 19, 2025 17:02
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from f182168 to 88c0bb6 Compare August 19, 2025 17:03
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from 75b2bf0 to ea77b25 Compare August 20, 2025 13:43
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from 88c0bb6 to 2d5441c Compare August 20, 2025 13:43
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from ea77b25 to ba4ddc7 Compare August 27, 2025 09:22
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from 2d5441c to 6a6ae1f Compare August 27, 2025 09:23
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from ba4ddc7 to de812ce Compare August 27, 2025 15:12
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from 6a6ae1f to abfef3b Compare August 27, 2025 15:12
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from a240cdb to 371598d Compare September 5, 2025 14:29
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from abfef3b to e380fb8 Compare September 5, 2025 14:31
@MacDue MacDue force-pushed the users/MacDue/agnostic-ZA branch from 371598d to 4242cb3 Compare September 9, 2025 09:58
Base automatically changed from users/MacDue/agnostic-ZA to main September 9, 2025 10:35
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from e380fb8 to f2e838b Compare September 9, 2025 13:30
MacDue added a commit that referenced this pull request Sep 9, 2025
Change-Id: Idef5b1e2a45585f97897fc11c4f237996edb7c8b
MacDue added a commit that referenced this pull request Sep 12, 2025
Change-Id: Idef5b1e2a45585f97897fc11c4f237996edb7c8b
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from f2e838b to b666a28 Compare September 12, 2025 11:25
@MacDue MacDue requested a review from SamTebbs33 September 30, 2025 11:32
Copy link
Collaborator

@SamTebbs33 SamTebbs33 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

MacDue added a commit that referenced this pull request Oct 2, 2025
Change-Id: Idef5b1e2a45585f97897fc11c4f237996edb7c8b
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from b666a28 to 03ee23f Compare October 2, 2025 11:39
MacDue added a commit that referenced this pull request Oct 15, 2025
Change-Id: Idef5b1e2a45585f97897fc11c4f237996edb7c8b
This patch uses the MachineLoopInfo to give blocks within loops a higher
weight when choosing the bundle ZA state. MachineLoopInfo does not find
loop trip counts, so this uses an arbitrary weight (default 10), which
can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag.

This makes the MachineSMEABIPass pass more likely to pick a bundle state
that matches the loop's entry/exit state, which avoids state changes in
the loop (which we assume will happen more than once).

This does require some extra analysis, so this is only enabled at -O1
and above.

Change-Id: If318c809d2f7cc1fca144fbe424ba2a2ca7fb19f
@MacDue MacDue force-pushed the users/MacDue/sme-loops branch from 03ee23f to 72b8956 Compare October 28, 2025 10:18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants