-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[X86][APX] Prevent from emitting push2/pop2 if stack alignment<16B #143076
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
… bytes push2/pop2 requires 16 bytes stack alignment. If the stack alignment is less than that, push2/pop2 should not be emitted.
|
@llvm/pr-subscribers-backend-x86 Author: Feng Zou (fzou1) Changespush2/pop2 requires 16-byte stack alignment. If the stack alignment is less than that, push2/pop2 should not be emitted. Full diff: https://github.com/llvm/llvm-project/pull/143076.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 75f49beee27c6..09b036b5f0c77 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2921,7 +2921,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// 3. When the number of CSR push is even, start to use push2 from the 1st
// push and make the stack 16B aligned before the push
unsigned NumRegsForPush2 = 0;
- if (STI.hasPush2Pop2()) {
+ if (STI.hasPush2Pop2() && getStackAlignment() >= 16) {
unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
return X86::GR64RegClass.contains(I.getReg());
});
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
new file mode 100644
index 0000000000000..d72c0bbcbcc7d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
+
+; This test is used to check no push2/pop2 emitted if stack alignment is set to
+; the value less than 16 bytes required by push2/pop2 instruction. Here it's set
+; to 8 bytes.
+
+define void @csr1() nounwind {
+; CHECK-LABEL: csr1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr2() nounwind {
+; CHECK-LABEL: csr2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr3() nounwind {
+; CHECK-LABEL: csr3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr4() nounwind {
+; CHECK-LABEL: csr4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr5() nounwind {
+; CHECK-LABEL: csr5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+define void @csr6() nounwind {
+; CHECK-LABEL: csr6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind {
+; CHECK-LABEL: lea_in_epilog:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_5
+; CHECK-NEXT: # %bb.1: # %bb13
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $16, %rsp
+; CHECK-NEXT: movq %r9, %r14
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: addq %r14, %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: addq %r14, %rbx
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB6_2: # %bb15
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: incq %r12
+; CHECK-NEXT: movl $432, %edx # imm = 0x1B0
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: movq %r15, %rsi
+; CHECK-NEXT: callq memcpy@PLT
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: addq %rax, %r13
+; CHECK-NEXT: addq %rax, %r15
+; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq $8, %rbp
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB6_2
+; CHECK-NEXT: # %bb.3: # %bb11
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: jne .LBB6_5
+; CHECK-NEXT: # %bb.4: # %bb12
+; CHECK-NEXT: movq $0, (%rax)
+; CHECK-NEXT: .LBB6_5: # %bb14
+; CHECK-NEXT: retq
+bb:
+ br i1 %arg, label %bb13, label %bb14
+
+bb11:
+ br i1 %arg, label %bb14, label %bb12
+
+bb12:
+ store double 0.000000e+00, ptr %arg1, align 8
+ br label %bb14
+
+bb13:
+ %getelementptr = getelementptr i8, ptr null, i64 %arg5
+ br label %bb15
+
+bb14:
+ ret void
+
+bb15:
+ %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ]
+ %getelementptr16 = getelementptr double, ptr null, i64 %phi
+ %add = add i64 %phi, 1
+ %mul = mul i64 %arg6, %add
+ %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul
+ call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false)
+ %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7
+ %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false)
+ %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false)
+ %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10
+ call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false)
+ br i1 %arg, label %bb11, label %bb15
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"override-stack-alignment", i32 8}
|
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LTGM with one comment.
|
|
||
| declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) | ||
|
|
||
| define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we need such complex test here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
|
Why don't set the alignment to 16B in this case? |
| // push and make the stack 16B aligned before the push | ||
| unsigned NumRegsForPush2 = 0; | ||
| if (STI.hasPush2Pop2()) { | ||
| if (STI.hasPush2Pop2() && getStackAlignment() >= 16) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this solution is better, you should mention the alignment requirement in above comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
There's an option to override the alignment. We should respect customer's selection. |
We respected , 16B align is also 8B align |
The 8B means OS can only guarantee stack is 8B aligned. In other word, it doesn't guarantee stack is always aligned to 16B under some special circumstances. |
…lvm#143076) push2/pop2 requires 16-byte stack alignment. If the stack alignment is less than that, push2/pop2 should not be emitted. It triggers general protection exception if the data being pushed/popped by push2/pop2 is not 16-byte aligned on the stack.
push2/pop2 requires 16-byte stack alignment. If the stack alignment is less than that, push2/pop2 should not be emitted. It triggers general protection exception if the data being pushed/popped by push2/pop2 is not 16-byte aligned on the stack.