Skip to content

Commit b3dfbc4

Browse files
committed
[AArch64] Fix Windows prologue handling to pair more registers.
Currently, there's code to suppress pairing a GPR with LR when that pair would be the first register pair, but we don't actually need to suppress the pairing itself; we just need to suppress the formation of pre-decrement/post-increment instructions for such pairs, because there is no corresponding unwind opcode (no save_lrpair_x). Pairing saves an instruction in some cases, and enables packed unwind in some cases. (There's a comment in the code noting we could enable packed unwind in more cases, but that's not clearly profitable.)
1 parent e7748e9 commit b3dfbc4

File tree

6 files changed

+106
-111
lines changed

6 files changed

+106
-111
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,7 +1556,6 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL,
15561556
static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile,
15571557
unsigned SpillCount, unsigned Reg1,
15581558
unsigned Reg2, bool NeedsWinCFI,
1559-
bool IsFirst,
15601559
const TargetRegisterInfo *TRI) {
15611560
// If we are generating register pairs for a Windows function that requires
15621561
// EH support, then pair consecutive registers only. There are no unwind
@@ -1582,12 +1581,9 @@ static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile,
15821581
: false;
15831582

15841583
// If pairing a GPR with LR, the pair can be described by the save_lrpair
1585-
// opcode. If this is the first register pair, it would end up with a
1586-
// predecrement, but there's no save_lrpair_x opcode, so we can only do this
1587-
// if LR is paired with something else than the first register.
1588-
// The save_lrpair opcode requires the first register to be an odd one.
1584+
// opcode. The save_lrpair opcode requires the first register to be odd.
15891585
if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
1590-
(Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
1586+
(Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR)
15911587
return false;
15921588
return true;
15931589
}
@@ -1600,12 +1596,10 @@ static bool invalidateRegisterPairing(bool SpillExtendedVolatile,
16001596
unsigned SpillCount, unsigned Reg1,
16011597
unsigned Reg2, bool UsesWinAAPCS,
16021598
bool NeedsWinCFI, bool NeedsFrameRecord,
1603-
bool IsFirst,
16041599
const TargetRegisterInfo *TRI) {
16051600
if (UsesWinAAPCS)
16061601
return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount,
1607-
Reg1, Reg2, NeedsWinCFI, IsFirst,
1608-
TRI);
1602+
Reg1, Reg2, NeedsWinCFI, TRI);
16091603

16101604
// If we need to store the frame record, don't pair any register
16111605
// with LR other than FP.
@@ -1775,21 +1769,20 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
17751769
// Add the next reg to the pair if it is in the same register class.
17761770
if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) {
17771771
MCRegister NextReg = CSI[i + RegInc].getReg();
1778-
bool IsFirst = i == FirstReg;
17791772
unsigned SpillCount = NeedsWinCFI ? FirstReg - i : i;
17801773
switch (RPI.Type) {
17811774
case RegPairInfo::GPR:
17821775
if (AArch64::GPR64RegClass.contains(NextReg) &&
1783-
!invalidateRegisterPairing(
1784-
SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows,
1785-
NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI))
1776+
!invalidateRegisterPairing(SpillExtendedVolatile, SpillCount,
1777+
RPI.Reg1, NextReg, IsWindows,
1778+
NeedsWinCFI, NeedsFrameRecord, TRI))
17861779
RPI.Reg2 = NextReg;
17871780
break;
17881781
case RegPairInfo::FPR64:
17891782
if (AArch64::FPR64RegClass.contains(NextReg) &&
17901783
!invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount,
17911784
RPI.Reg1, NextReg, NeedsWinCFI,
1792-
IsFirst, TRI))
1785+
TRI))
17931786
RPI.Reg2 = NextReg;
17941787
break;
17951788
case RegPairInfo::FPR128:

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,23 @@ AArch64PrologueEpilogueCommon::convertCalleeSaveRestoreToSPPrePostIncDec(
179179
(void)Success;
180180
assert(Success && "unknown load/store opcode");
181181

182+
const auto *TRI = Subtarget.getRegisterInfo();
182183
// If the first store isn't right where we want SP then we can't fold the
183184
// update in so create a normal arithmetic instruction instead.
185+
//
186+
// On Windows, some register pairs involving LR can't be folded because
187+
// there isn't a corresponding unwind opcode. (Note that packed unwind expects
188+
// a sequence like "sub sp, sp, #16; stp x19, lr, [sp]; sub sp, sp, #16",
189+
// but we currently generate "sub sp, sp, #32; stp x19, lr, [sp, #16]". We
190+
// could handle that here, but it's not clearly profitable; it saves up to
191+
// 4 words of xdata, but it costs 2 instructions.)
184192
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
185193
CSStackSizeInc < MinOffset * (int64_t)Scale.getFixedValue() ||
186-
CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) {
194+
CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue() ||
195+
(NeedsWinCFI &&
196+
(NewOpc == AArch64::LDPXpost || NewOpc == AArch64::STPXpre) &&
197+
TRI->getEncodingValue(MBBI->getOperand(0).getReg()) + 1 !=
198+
TRI->getEncodingValue(MBBI->getOperand(1).getReg()))) {
187199
// If we are destroying the frame, make sure we add the increment after the
188200
// last frame operation.
189201
if (FrameFlag == MachineInstr::FrameDestroy) {

llvm/test/CodeGen/AArch64/arm64-windows-calls.ll

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -143,21 +143,21 @@ define void @call_copy_pod() {
143143
; CHECK-LABEL: call_copy_pod:
144144
; CHECK: .seh_proc call_copy_pod
145145
; CHECK-NEXT: // %bb.0:
146-
; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill
147-
; CHECK-NEXT: .seh_save_reg_x x19, 16
148-
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill
149-
; CHECK-NEXT: .seh_save_reg x30, 8
146+
; CHECK-NEXT: sub sp, sp, #16
147+
; CHECK-NEXT: .seh_stackalloc 16
148+
; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill
149+
; CHECK-NEXT: .seh_save_lrpair x19, 0
150150
; CHECK-NEXT: .seh_endprologue
151151
; CHECK-NEXT: adrp x19, Pod
152152
; CHECK-NEXT: add x19, x19, :lo12:Pod
153153
; CHECK-NEXT: mov x0, x19
154154
; CHECK-NEXT: bl copy_pod
155155
; CHECK-NEXT: stp d0, d1, [x19]
156156
; CHECK-NEXT: .seh_startepilogue
157-
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload
158-
; CHECK-NEXT: .seh_save_reg x30, 8
159-
; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
160-
; CHECK-NEXT: .seh_save_reg_x x19, 16
157+
; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload
158+
; CHECK-NEXT: .seh_save_lrpair x19, 0
159+
; CHECK-NEXT: add sp, sp, #16
160+
; CHECK-NEXT: .seh_stackalloc 16
161161
; CHECK-NEXT: .seh_endepilogue
162162
; CHECK-NEXT: ret
163163
; CHECK-NEXT: .seh_endfunclet
@@ -175,10 +175,8 @@ define void @call_copy_notcxx14aggregate() {
175175
; CHECK-NEXT: // %bb.0:
176176
; CHECK-NEXT: sub sp, sp, #32
177177
; CHECK-NEXT: .seh_stackalloc 32
178-
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
179-
; CHECK-NEXT: .seh_save_reg x19, 16
180-
; CHECK-NEXT: str x30, [sp, #24] // 8-byte Spill
181-
; CHECK-NEXT: .seh_save_reg x30, 24
178+
; CHECK-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill
179+
; CHECK-NEXT: .seh_save_lrpair x19, 16
182180
; CHECK-NEXT: .seh_endprologue
183181
; CHECK-NEXT: adrp x19, NotCXX14Aggregate
184182
; CHECK-NEXT: add x19, x19, :lo12:NotCXX14Aggregate
@@ -188,10 +186,8 @@ define void @call_copy_notcxx14aggregate() {
188186
; CHECK-NEXT: ldp d0, d1, [sp]
189187
; CHECK-NEXT: stp d0, d1, [x19]
190188
; CHECK-NEXT: .seh_startepilogue
191-
; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Reload
192-
; CHECK-NEXT: .seh_save_reg x30, 24
193-
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
194-
; CHECK-NEXT: .seh_save_reg x19, 16
189+
; CHECK-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload
190+
; CHECK-NEXT: .seh_save_lrpair x19, 16
195191
; CHECK-NEXT: add sp, sp, #32
196192
; CHECK-NEXT: .seh_stackalloc 32
197193
; CHECK-NEXT: .seh_endepilogue
@@ -211,21 +207,21 @@ define void @call_copy_notpod() {
211207
; CHECK-LABEL: call_copy_notpod:
212208
; CHECK: .seh_proc call_copy_notpod
213209
; CHECK-NEXT: // %bb.0:
214-
; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill
215-
; CHECK-NEXT: .seh_save_reg_x x19, 16
216-
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Spill
217-
; CHECK-NEXT: .seh_save_reg x30, 8
210+
; CHECK-NEXT: sub sp, sp, #16
211+
; CHECK-NEXT: .seh_stackalloc 16
212+
; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill
213+
; CHECK-NEXT: .seh_save_lrpair x19, 0
218214
; CHECK-NEXT: .seh_endprologue
219215
; CHECK-NEXT: adrp x19, NotPod
220216
; CHECK-NEXT: add x19, x19, :lo12:NotPod
221217
; CHECK-NEXT: mov x0, x19
222218
; CHECK-NEXT: bl copy_notpod
223219
; CHECK-NEXT: stp x0, x1, [x19]
224220
; CHECK-NEXT: .seh_startepilogue
225-
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Reload
226-
; CHECK-NEXT: .seh_save_reg x30, 8
227-
; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
228-
; CHECK-NEXT: .seh_save_reg_x x19, 16
221+
; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload
222+
; CHECK-NEXT: .seh_save_lrpair x19, 0
223+
; CHECK-NEXT: add sp, sp, #16
224+
; CHECK-NEXT: .seh_stackalloc 16
229225
; CHECK-NEXT: .seh_endepilogue
230226
; CHECK-NEXT: ret
231227
; CHECK-NEXT: .seh_endfunclet

llvm/test/CodeGen/AArch64/win64_vararg2.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) {
99
; CHECK-NEXT: // %bb.0:
1010
; CHECK-NEXT: sub sp, sp, #80
1111
; CHECK-NEXT: .seh_stackalloc 80
12-
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
13-
; CHECK-NEXT: .seh_save_reg x19, 16
14-
; CHECK-NEXT: str x30, [sp, #24] // 8-byte Spill
15-
; CHECK-NEXT: .seh_save_reg x30, 24
12+
; CHECK-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill
13+
; CHECK-NEXT: .seh_save_lrpair x19, 16
1614
; CHECK-NEXT: .seh_endprologue
1715
; CHECK-NEXT: add x8, sp, #40
1816
; CHECK-NEXT: mov w19, w0
@@ -27,10 +25,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) {
2725
; CHECK-NEXT: cmp w19, w0
2826
; CHECK-NEXT: cset w0, ls
2927
; CHECK-NEXT: .seh_startepilogue
30-
; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Reload
31-
; CHECK-NEXT: .seh_save_reg x30, 24
32-
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
33-
; CHECK-NEXT: .seh_save_reg x19, 16
28+
; CHECK-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload
29+
; CHECK-NEXT: .seh_save_lrpair x19, 16
3430
; CHECK-NEXT: add sp, sp, #80
3531
; CHECK-NEXT: .seh_stackalloc 80
3632
; CHECK-NEXT: .seh_endepilogue
@@ -43,10 +39,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) {
4339
; GISEL-NEXT: // %bb.0:
4440
; GISEL-NEXT: sub sp, sp, #80
4541
; GISEL-NEXT: .seh_stackalloc 80
46-
; GISEL-NEXT: str x19, [sp, #16] // 8-byte Spill
47-
; GISEL-NEXT: .seh_save_reg x19, 16
48-
; GISEL-NEXT: str x30, [sp, #24] // 8-byte Spill
49-
; GISEL-NEXT: .seh_save_reg x30, 24
42+
; GISEL-NEXT: stp x19, x30, [sp, #16] // 16-byte Folded Spill
43+
; GISEL-NEXT: .seh_save_lrpair x19, 16
5044
; GISEL-NEXT: .seh_endprologue
5145
; GISEL-NEXT: add x8, sp, #40
5246
; GISEL-NEXT: mov w19, w0
@@ -61,10 +55,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) {
6155
; GISEL-NEXT: cmp w19, w0
6256
; GISEL-NEXT: cset w0, ls
6357
; GISEL-NEXT: .seh_startepilogue
64-
; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Reload
65-
; GISEL-NEXT: .seh_save_reg x30, 24
66-
; GISEL-NEXT: ldr x19, [sp, #16] // 8-byte Reload
67-
; GISEL-NEXT: .seh_save_reg x19, 16
58+
; GISEL-NEXT: ldp x19, x30, [sp, #16] // 16-byte Folded Reload
59+
; GISEL-NEXT: .seh_save_lrpair x19, 16
6860
; GISEL-NEXT: add sp, sp, #80
6961
; GISEL-NEXT: .seh_stackalloc 80
7062
; GISEL-NEXT: .seh_endepilogue

llvm/test/CodeGen/AArch64/wineh-pac.ll

Lines changed: 52 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
12
; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s
23

34
define dso_local i32 @func(ptr %g, i32 %a) "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" {
5+
; CHECK-LABEL: func:
6+
; CHECK: .seh_proc func
7+
; CHECK-NEXT: // %bb.0: // %entry
8+
; CHECK-NEXT: hint #27
9+
; CHECK-NEXT: .seh_pac_sign_lr
10+
; CHECK-NEXT: sub sp, sp, #16
11+
; CHECK-NEXT: .seh_stackalloc 16
12+
; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill
13+
; CHECK-NEXT: .seh_save_lrpair x19, 0
14+
; CHECK-NEXT: .seh_endprologue
15+
; CHECK-NEXT: mov w19, w1
16+
; CHECK-NEXT: blr x0
17+
; CHECK-NEXT: mov w0, w19
18+
; CHECK-NEXT: .seh_startepilogue
19+
; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload
20+
; CHECK-NEXT: .seh_save_lrpair x19, 0
21+
; CHECK-NEXT: add sp, sp, #16
22+
; CHECK-NEXT: .seh_stackalloc 16
23+
; CHECK-NEXT: hint #31
24+
; CHECK-NEXT: .seh_pac_sign_lr
25+
; CHECK-NEXT: .seh_endepilogue
26+
; CHECK-NEXT: ret
27+
; CHECK-NEXT: .seh_endfunclet
28+
; CHECK-NEXT: .seh_endproc
429
entry:
530
tail call void %g() #2
631
ret i32 %a
732
}
833

34+
;; For func2, check that the potentially folded autibsp+ret -> retab
35+
;; is handled correctly - currently we inhibit producing retab here.
36+
937
define dso_local i32 @func2(ptr %g, i32 %a) "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" "target-features"="+v8.3a" {
38+
; CHECK-LABEL: func2:
39+
; CHECK: .seh_proc func2
40+
; CHECK-NEXT: // %bb.0: // %entry
41+
; CHECK-NEXT: pacibsp
42+
; CHECK-NEXT: .seh_pac_sign_lr
43+
; CHECK-NEXT: sub sp, sp, #16
44+
; CHECK-NEXT: .seh_stackalloc 16
45+
; CHECK-NEXT: stp x19, x30, [sp] // 16-byte Folded Spill
46+
; CHECK-NEXT: .seh_save_lrpair x19, 0
47+
; CHECK-NEXT: .seh_endprologue
48+
; CHECK-NEXT: mov w19, w1
49+
; CHECK-NEXT: blr x0
50+
; CHECK-NEXT: mov w0, w19
51+
; CHECK-NEXT: .seh_startepilogue
52+
; CHECK-NEXT: ldp x19, x30, [sp] // 16-byte Folded Reload
53+
; CHECK-NEXT: .seh_save_lrpair x19, 0
54+
; CHECK-NEXT: add sp, sp, #16
55+
; CHECK-NEXT: .seh_stackalloc 16
56+
; CHECK-NEXT: autibsp
57+
; CHECK-NEXT: .seh_pac_sign_lr
58+
; CHECK-NEXT: .seh_endepilogue
59+
; CHECK-NEXT: ret
60+
; CHECK-NEXT: .seh_endfunclet
61+
; CHECK-NEXT: .seh_endproc
1062
entry:
1163
tail call void %g() #2
1264
ret i32 %a
1365
}
14-
15-
16-
; CHECK-LABEL: func:
17-
; CHECK-NEXT: .seh_proc func
18-
; CHECK-NEXT: // %bb.0:
19-
; CHECK-NEXT: hint #27
20-
; CHECK-NEXT: .seh_pac_sign_lr
21-
; CHECK-NEXT: str x19, [sp, #-16]!
22-
; CHECK-NEXT: .seh_save_reg_x x19, 16
23-
; CHECK-NEXT: str x30, [sp, #8]
24-
; CHECK-NEXT: .seh_save_reg x30, 8
25-
; CHECK-NEXT: .seh_endprologue
26-
27-
; CHECK: .seh_startepilogue
28-
; CHECK-NEXT: ldr x30, [sp, #8]
29-
; CHECK-NEXT: .seh_save_reg x30, 8
30-
; CHECK-NEXT: ldr x19, [sp], #16
31-
; CHECK-NEXT: .seh_save_reg_x x19, 16
32-
; CHECK-NEXT: hint #31
33-
; CHECK-NEXT: .seh_pac_sign_lr
34-
; CHECK-NEXT: .seh_endepilogue
35-
; CHECK-NEXT: ret
36-
; CHECK-NEXT: .seh_endfunclet
37-
; CHECK-NEXT: .seh_endproc
38-
39-
;; For func2, check that the potentially folded autibsp+ret -> retab
40-
;; is handled correctly - currently we inhibit producing retab here.
41-
42-
; CHECK-LABEL: func2:
43-
; CHECK-NEXT: .seh_proc func2
44-
; CHECK-NEXT: // %bb.0:
45-
; CHECK-NEXT: pacibsp
46-
; CHECK-NEXT: .seh_pac_sign_lr
47-
; CHECK-NEXT: str x19, [sp, #-16]!
48-
; CHECK-NEXT: .seh_save_reg_x x19, 16
49-
; CHECK-NEXT: str x30, [sp, #8]
50-
; CHECK-NEXT: .seh_save_reg x30, 8
51-
; CHECK-NEXT: .seh_endprologue
52-
53-
; CHECK: .seh_startepilogue
54-
; CHECK-NEXT: ldr x30, [sp, #8]
55-
; CHECK-NEXT: .seh_save_reg x30, 8
56-
; CHECK-NEXT: ldr x19, [sp], #16
57-
; CHECK-NEXT: .seh_save_reg_x x19, 16
58-
; CHECK-NEXT: autibsp
59-
; CHECK-NEXT: .seh_pac_sign_lr
60-
; CHECK-NEXT: .seh_endepilogue
61-
; CHECK-NEXT: ret
62-
; CHECK-NEXT: .seh_endfunclet
63-
; CHECK-NEXT: .seh_endproc

llvm/test/CodeGen/AArch64/wineh-save-lrpair2.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
22
# RUN: -stop-after=prologepilog | FileCheck %s
33

4-
# Check that lr isn't paired with a GPR if it's the first pair, as
5-
# that can't be described as a SEH opcode if combined with predecrement.
4+
# Check that when LR is paired with a GPR, we don't combine it into a
5+
# predecrement that can't be described as a SEH opcode.
66

7-
# CHECK: early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16
8-
# CHECK-NEXT: frame-setup SEH_SaveReg_X 19, -16
9-
# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 1
10-
# CHECK-NEXT: frame-setup SEH_SaveReg 30, 8
7+
# CHECK: $sp = frame-setup SUBXri $sp, 16, 0
8+
# CHECK-NEXT: frame-setup SEH_StackAlloc 16
9+
# CHECK-NEXT: frame-setup STPXi killed $x19, killed $lr, $sp, 0
10+
# CHECK-NEXT: frame-setup SEH_SaveRegP 19, 30, 0
1111
# CHECK-NEXT: frame-setup SEH_PrologEnd
1212

1313
--- |

0 commit comments

Comments
 (0)