Skip to content

Commit 72fc8f4

Browse files
committed
[AArch64][SME] Simplify initialization of TPIDR2 block
This patch updates the definition of `AArch64ISD::INIT_TPIDR2OBJ` to take the number of save slices (which is currently always all ZA slices). Using this, we can initialize the TPIDR2 block with a single STP of the save buffer pointer and the number of save slices. The reserved bytes (10-15) are implicitly zeroed, because the result of RDSVL always fits in 16 bits and so the upper 48 bits of the stored 64-bit value are zero. Using an STP is also possible for big-endian targets, with an additional left shift (by 48) that moves the number of save slices into the top two bytes of the register. Note: We used to write the number of save slices to the TPIDR2 block before every call with a lazy save; however, based on 6.6.9 "Changes to the TPIDR2 block" in the aapcs64 (https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#changes-to-the-tpidr2-block), it seems we can rely on callers preserving the contents of the TPIDR2 block.
1 parent d606eae commit 72fc8f4

File tree

10 files changed

+149
-138
lines changed

10 files changed

+149
-138
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2985,20 +2985,24 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
29852985
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
29862986
if (TPIDR2.Uses > 0) {
29872987
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2988-
// Store the buffer pointer to the TPIDR2 stack object.
2989-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
2988+
unsigned TPIDInitSaveSlicesReg = MI.getOperand(1).getReg();
2989+
if (!Subtarget->isLittleEndian()) {
2990+
unsigned TmpReg =
2991+
MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
2992+
// For big-endian targets move "num_za_save_slices" to the top two bytes.
2993+
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::UBFMXri), TmpReg)
2994+
.addReg(TPIDInitSaveSlicesReg)
2995+
.addImm(16)
2996+
.addImm(15);
2997+
TPIDInitSaveSlicesReg = TmpReg;
2998+
}
2999+
// Store buffer pointer and num_za_save_slices.
3000+
// Bytes 10-15 are implicitly zeroed.
3001+
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
29903002
.addReg(MI.getOperand(0).getReg())
3003+
.addReg(TPIDInitSaveSlicesReg)
29913004
.addFrameIndex(TPIDR2.FrameIndex)
29923005
.addImm(0);
2993-
// Set the reserved bytes (10-15) to zero
2994-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
2995-
.addReg(AArch64::WZR)
2996-
.addFrameIndex(TPIDR2.FrameIndex)
2997-
.addImm(5);
2998-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
2999-
.addReg(AArch64::WZR)
3000-
.addFrameIndex(TPIDR2.FrameIndex)
3001-
.addImm(3);
30023006
} else
30033007
MFI.RemoveStackObject(TPIDR2.FrameIndex);
30043008

@@ -8313,9 +8317,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
83138317
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
83148318
MFI.CreateVariableSizedObject(Align(16), nullptr);
83158319
}
8320+
SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8321+
DAG.getConstant(1, DL, MVT::i32));
83168322
Chain = DAG.getNode(
83178323
AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8318-
{/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8324+
{/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8325+
/*Num save slices*/ NumZaSaveSlices});
83198326
} else if (Attrs.hasAgnosticZAInterface()) {
83208327
// Call __arm_sme_state_size().
83218328
SDValue BufferSize =
@@ -9165,19 +9172,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
91659172
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
91669173
bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
91679174
if (RequiresLazySave) {
9168-
const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9169-
MachinePointerInfo MPI =
9170-
MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
9175+
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
91719176
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
91729177
TPIDR2.FrameIndex,
91739178
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9174-
SDValue NumZaSaveSlicesAddr =
9175-
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9176-
DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9177-
SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9178-
DAG.getConstant(1, DL, MVT::i32));
9179-
Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9180-
MPI, MVT::i16);
91819179
Chain = DAG.getNode(
91829180
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
91839181
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
6161
def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
6262
(AllocateZABuffer $size)>;
6363

64-
def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
65-
[SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
64+
def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2,
65+
[SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>;
6666
let usesCustomInserter = 1 in {
67-
def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
67+
def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {}
6868
}
6969

7070
// Nodes to allocate a save buffer for SME.

llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -250,10 +250,7 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
250250
; CHECK-COMMON-NEXT: mov x9, sp
251251
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
252252
; CHECK-COMMON-NEXT: mov sp, x9
253-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
254-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
255-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
256-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
253+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
257254
; CHECK-COMMON-NEXT: sub x8, x29, #16
258255
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
259256
; CHECK-COMMON-NEXT: bl normal_callee
@@ -292,12 +289,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
292289
; CHECK-COMMON-NEXT: mov x9, sp
293290
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
294291
; CHECK-COMMON-NEXT: mov sp, x9
295-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
296-
; CHECK-COMMON-NEXT: sub x9, x29, #16
297-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
298-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
299-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
300-
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
292+
; CHECK-COMMON-NEXT: sub x10, x29, #16
293+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
294+
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
301295
; CHECK-COMMON-NEXT: bl __addtf3
302296
; CHECK-COMMON-NEXT: smstart za
303297
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
@@ -356,12 +350,9 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
356350
; CHECK-COMMON-NEXT: mov x9, sp
357351
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
358352
; CHECK-COMMON-NEXT: mov sp, x9
359-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
360-
; CHECK-COMMON-NEXT: sub x9, x29, #16
361-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
362-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
363-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
364-
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
353+
; CHECK-COMMON-NEXT: sub x10, x29, #16
354+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
355+
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
365356
; CHECK-COMMON-NEXT: bl fmod
366357
; CHECK-COMMON-NEXT: smstart za
367358
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0

llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll

Lines changed: 21 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,9 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
2020
; CHECK-NEXT: mov x9, sp
2121
; CHECK-NEXT: msub x9, x8, x8, x9
2222
; CHECK-NEXT: mov sp, x9
23-
; CHECK-NEXT: stur x9, [x29, #-16]
24-
; CHECK-NEXT: sub x9, x29, #16
25-
; CHECK-NEXT: sturh wzr, [x29, #-6]
26-
; CHECK-NEXT: stur wzr, [x29, #-4]
27-
; CHECK-NEXT: sturh w8, [x29, #-8]
28-
; CHECK-NEXT: msr TPIDR2_EL0, x9
23+
; CHECK-NEXT: sub x10, x29, #16
24+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
25+
; CHECK-NEXT: msr TPIDR2_EL0, x10
2926
; CHECK-NEXT: bl private_za_callee
3027
; CHECK-NEXT: smstart za
3128
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -74,21 +71,17 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
7471
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
7572
; CHECK-LABEL: test_lazy_save_2_callees:
7673
; CHECK: // %bb.0:
77-
; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
78-
; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
74+
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
75+
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
7976
; CHECK-NEXT: mov x29, sp
80-
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
8177
; CHECK-NEXT: sub sp, sp, #16
82-
; CHECK-NEXT: rdsvl x20, #1
83-
; CHECK-NEXT: mov x8, sp
84-
; CHECK-NEXT: msub x8, x20, x20, x8
85-
; CHECK-NEXT: mov sp, x8
86-
; CHECK-NEXT: sub x21, x29, #16
87-
; CHECK-NEXT: stur x8, [x29, #-16]
88-
; CHECK-NEXT: sturh wzr, [x29, #-6]
89-
; CHECK-NEXT: stur wzr, [x29, #-4]
90-
; CHECK-NEXT: sturh w20, [x29, #-8]
91-
; CHECK-NEXT: msr TPIDR2_EL0, x21
78+
; CHECK-NEXT: rdsvl x8, #1
79+
; CHECK-NEXT: mov x9, sp
80+
; CHECK-NEXT: msub x9, x8, x8, x9
81+
; CHECK-NEXT: mov sp, x9
82+
; CHECK-NEXT: sub x20, x29, #16
83+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
84+
; CHECK-NEXT: msr TPIDR2_EL0, x20
9285
; CHECK-NEXT: bl private_za_callee
9386
; CHECK-NEXT: smstart za
9487
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -98,8 +91,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
9891
; CHECK-NEXT: bl __arm_tpidr2_restore
9992
; CHECK-NEXT: .LBB1_2:
10093
; CHECK-NEXT: msr TPIDR2_EL0, xzr
101-
; CHECK-NEXT: sturh w20, [x29, #-8]
102-
; CHECK-NEXT: msr TPIDR2_EL0, x21
94+
; CHECK-NEXT: msr TPIDR2_EL0, x20
10395
; CHECK-NEXT: bl private_za_callee
10496
; CHECK-NEXT: smstart za
10597
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -110,9 +102,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
110102
; CHECK-NEXT: .LBB1_4:
111103
; CHECK-NEXT: msr TPIDR2_EL0, xzr
112104
; CHECK-NEXT: mov sp, x29
113-
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
114-
; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
115-
; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
105+
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
106+
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
116107
; CHECK-NEXT: ret
117108
;
118109
; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees:
@@ -159,12 +150,9 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
159150
; CHECK-NEXT: mov x9, sp
160151
; CHECK-NEXT: msub x9, x8, x8, x9
161152
; CHECK-NEXT: mov sp, x9
162-
; CHECK-NEXT: stur x9, [x29, #-16]
163-
; CHECK-NEXT: sub x9, x29, #16
164-
; CHECK-NEXT: sturh wzr, [x29, #-6]
165-
; CHECK-NEXT: stur wzr, [x29, #-4]
166-
; CHECK-NEXT: sturh w8, [x29, #-8]
167-
; CHECK-NEXT: msr TPIDR2_EL0, x9
153+
; CHECK-NEXT: sub x10, x29, #16
154+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
155+
; CHECK-NEXT: msr TPIDR2_EL0, x10
168156
; CHECK-NEXT: bl cosf
169157
; CHECK-NEXT: smstart za
170158
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -227,12 +215,9 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
227215
; CHECK-NEXT: mov x20, x0
228216
; CHECK-NEXT: msub x9, x8, x8, x9
229217
; CHECK-NEXT: mov sp, x9
230-
; CHECK-NEXT: stur x9, [x29, #-80]
231-
; CHECK-NEXT: sub x9, x29, #80
232-
; CHECK-NEXT: sturh wzr, [x29, #-70]
233-
; CHECK-NEXT: stur wzr, [x29, #-68]
234-
; CHECK-NEXT: sturh w8, [x29, #-72]
235-
; CHECK-NEXT: msr TPIDR2_EL0, x9
218+
; CHECK-NEXT: sub x10, x29, #80
219+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
220+
; CHECK-NEXT: msr TPIDR2_EL0, x10
236221
; CHECK-NEXT: tbz w20, #0, .LBB3_2
237222
; CHECK-NEXT: // %bb.1:
238223
; CHECK-NEXT: smstop sm

llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,9 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
1515
; CHECK-NEXT: mov x9, sp
1616
; CHECK-NEXT: msub x9, x8, x8, x9
1717
; CHECK-NEXT: mov sp, x9
18-
; CHECK-NEXT: stur x9, [x29, #-16]
19-
; CHECK-NEXT: sub x9, x29, #16
20-
; CHECK-NEXT: sturh wzr, [x29, #-6]
21-
; CHECK-NEXT: stur wzr, [x29, #-4]
22-
; CHECK-NEXT: sturh w8, [x29, #-8]
23-
; CHECK-NEXT: msr TPIDR2_EL0, x9
18+
; CHECK-NEXT: sub x10, x29, #16
19+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
20+
; CHECK-NEXT: msr TPIDR2_EL0, x10
2421
; CHECK-NEXT: bl private_za_callee
2522
; CHECK-NEXT: smstart za
2623
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -73,12 +70,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
7370
; CHECK-NEXT: mov x9, sp
7471
; CHECK-NEXT: msub x9, x8, x8, x9
7572
; CHECK-NEXT: mov sp, x9
76-
; CHECK-NEXT: stur x9, [x29, #-16]
77-
; CHECK-NEXT: sub x9, x29, #16
78-
; CHECK-NEXT: sturh wzr, [x29, #-6]
79-
; CHECK-NEXT: stur wzr, [x29, #-4]
80-
; CHECK-NEXT: sturh w8, [x29, #-8]
81-
; CHECK-NEXT: msr TPIDR2_EL0, x9
73+
; CHECK-NEXT: sub x10, x29, #16
74+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
75+
; CHECK-NEXT: msr TPIDR2_EL0, x10
8276
; CHECK-NEXT: bl __addtf3
8377
; CHECK-NEXT: smstart za
8478
; CHECK-NEXT: mrs x8, TPIDR2_EL0
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64_be -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefix=CHECK-BE
4+
5+
declare void @private_za_callee()
6+
declare float @llvm.cos.f32(float)
7+
8+
; Test TPIDR2_EL0 is initialized correctly for AArch64 big-endian.
9+
define void @test_tpidr2_init() nounwind "aarch64_inout_za" {
10+
; CHECK-LABEL: test_tpidr2_init:
11+
; CHECK: // %bb.0:
12+
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
13+
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
14+
; CHECK-NEXT: mov x29, sp
15+
; CHECK-NEXT: sub sp, sp, #16
16+
; CHECK-NEXT: rdsvl x8, #1
17+
; CHECK-NEXT: mov x9, sp
18+
; CHECK-NEXT: msub x9, x8, x8, x9
19+
; CHECK-NEXT: mov sp, x9
20+
; CHECK-NEXT: sub x10, x29, #16
21+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
22+
; CHECK-NEXT: msr TPIDR2_EL0, x10
23+
; CHECK-NEXT: bl private_za_callee
24+
; CHECK-NEXT: smstart za
25+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
26+
; CHECK-NEXT: sub x0, x29, #16
27+
; CHECK-NEXT: cbnz x8, .LBB0_2
28+
; CHECK-NEXT: // %bb.1:
29+
; CHECK-NEXT: bl __arm_tpidr2_restore
30+
; CHECK-NEXT: .LBB0_2:
31+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
32+
; CHECK-NEXT: mov sp, x29
33+
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
34+
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
35+
; CHECK-NEXT: ret
36+
;
37+
; CHECK-BE-LABEL: test_tpidr2_init:
38+
; CHECK-BE: // %bb.0:
39+
; CHECK-BE-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
40+
; CHECK-BE-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
41+
; CHECK-BE-NEXT: mov x29, sp
42+
; CHECK-BE-NEXT: sub sp, sp, #16
43+
; CHECK-BE-NEXT: rdsvl x8, #1
44+
; CHECK-BE-NEXT: mov x9, sp
45+
; CHECK-BE-NEXT: msub x9, x8, x8, x9
46+
; CHECK-BE-NEXT: mov sp, x9
47+
; CHECK-BE-NEXT: lsl x8, x8, #48
48+
; CHECK-BE-NEXT: sub x10, x29, #16
49+
; CHECK-BE-NEXT: stp x9, x8, [x29, #-16]
50+
; CHECK-BE-NEXT: msr TPIDR2_EL0, x10
51+
; CHECK-BE-NEXT: bl private_za_callee
52+
; CHECK-BE-NEXT: smstart za
53+
; CHECK-BE-NEXT: mrs x8, TPIDR2_EL0
54+
; CHECK-BE-NEXT: sub x0, x29, #16
55+
; CHECK-BE-NEXT: cbnz x8, .LBB0_2
56+
; CHECK-BE-NEXT: // %bb.1:
57+
; CHECK-BE-NEXT: bl __arm_tpidr2_restore
58+
; CHECK-BE-NEXT: .LBB0_2:
59+
; CHECK-BE-NEXT: msr TPIDR2_EL0, xzr
60+
; CHECK-BE-NEXT: mov sp, x29
61+
; CHECK-BE-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
62+
; CHECK-BE-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
63+
; CHECK-BE-NEXT: ret
64+
call void @private_za_callee()
65+
ret void
66+
}

llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,18 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
2222
; CHECK-NEXT: .cfi_offset w29, -16
2323
; CHECK-NEXT: rdsvl x8, #1
2424
; CHECK-NEXT: mov x9, sp
25-
; CHECK-NEXT: msub x8, x8, x8, x9
26-
; CHECK-NEXT: mov sp, x8
27-
; CHECK-NEXT: stur x8, [x29, #-16]
28-
; CHECK-NEXT: sturh wzr, [x29, #-6]
29-
; CHECK-NEXT: stur wzr, [x29, #-4]
25+
; CHECK-NEXT: msub x9, x8, x8, x9
26+
; CHECK-NEXT: mov sp, x9
27+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
3028
; CHECK-NEXT: cbz w0, .LBB1_2
3129
; CHECK-NEXT: // %bb.1: // %use_b
3230
; CHECK-NEXT: fmov s1, #4.00000000
3331
; CHECK-NEXT: fadd s0, s0, s1
3432
; CHECK-NEXT: b .LBB1_5
3533
; CHECK-NEXT: .LBB1_2: // %use_c
3634
; CHECK-NEXT: fmov s0, s1
37-
; CHECK-NEXT: rdsvl x8, #1
38-
; CHECK-NEXT: sub x9, x29, #16
39-
; CHECK-NEXT: sturh w8, [x29, #-8]
40-
; CHECK-NEXT: msr TPIDR2_EL0, x9
35+
; CHECK-NEXT: sub x8, x29, #16
36+
; CHECK-NEXT: msr TPIDR2_EL0, x8
4137
; CHECK-NEXT: bl cosf
4238
; CHECK-NEXT: smstart za
4339
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -115,31 +111,27 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
115111
; CHECK-NEXT: .cfi_offset w29, -16
116112
; CHECK-NEXT: rdsvl x8, #1
117113
; CHECK-NEXT: mov x9, sp
118-
; CHECK-NEXT: msub x8, x8, x8, x9
114+
; CHECK-NEXT: msub x9, x8, x8, x9
119115
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
120116
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
121-
; CHECK-NEXT: cmp sp, x8
117+
; CHECK-NEXT: cmp sp, x9
122118
; CHECK-NEXT: b.le .LBB2_3
123119
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
124120
; CHECK-NEXT: str xzr, [sp]
125121
; CHECK-NEXT: b .LBB2_1
126122
; CHECK-NEXT: .LBB2_3:
127-
; CHECK-NEXT: mov sp, x8
123+
; CHECK-NEXT: mov sp, x9
128124
; CHECK-NEXT: ldr xzr, [sp]
129-
; CHECK-NEXT: stur x8, [x29, #-16]
130-
; CHECK-NEXT: sturh wzr, [x29, #-6]
131-
; CHECK-NEXT: stur wzr, [x29, #-4]
125+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
132126
; CHECK-NEXT: cbz w0, .LBB2_5
133127
; CHECK-NEXT: // %bb.4: // %use_b
134128
; CHECK-NEXT: fmov s1, #4.00000000
135129
; CHECK-NEXT: fadd s0, s0, s1
136130
; CHECK-NEXT: b .LBB2_8
137131
; CHECK-NEXT: .LBB2_5: // %use_c
138132
; CHECK-NEXT: fmov s0, s1
139-
; CHECK-NEXT: rdsvl x8, #1
140-
; CHECK-NEXT: sub x9, x29, #16
141-
; CHECK-NEXT: sturh w8, [x29, #-8]
142-
; CHECK-NEXT: msr TPIDR2_EL0, x9
133+
; CHECK-NEXT: sub x8, x29, #16
134+
; CHECK-NEXT: msr TPIDR2_EL0, x8
143135
; CHECK-NEXT: bl cosf
144136
; CHECK-NEXT: smstart za
145137
; CHECK-NEXT: mrs x8, TPIDR2_EL0

0 commit comments

Comments
 (0)