From 4c71a9845349753b8e8454452c04a4a9b5331424 Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Fri, 6 Jun 2025 12:48:50 +0800 Subject: [PATCH 1/2] [X86][APX] Prevent emitting push2/pop2 when stack alignment < 16 bytes push2/pop2 require 16-byte stack alignment. If the stack alignment is less than that, push2/pop2 should not be emitted. --- llvm/lib/Target/X86/X86FrameLowering.cpp | 2 +- ...op2-disabled-with-small-stack-alignment.ll | 209 ++++++++++++++++++ 2 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 75f49beee27c6..09b036b5f0c77 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2921,7 +2921,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // 3. When the number of CSR push is even, start to use push2 from the 1st // push and make the stack 16B aligned before the push unsigned NumRegsForPush2 = 0; - if (STI.hasPush2Pop2()) { + if (STI.hasPush2Pop2() && getStackAlignment() >= 16) { unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) { return X86::GR64RegClass.contains(I.getReg()); }); diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll new file mode 100644 index 0000000000000..d72c0bbcbcc7d --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK + +; This test checks that no push2/pop2 is emitted when the stack alignment is +; less than the 16 bytes required by the push2/pop2 instructions. Here it is +; set to 8 bytes.
+ +define void @csr1() nounwind { +; CHECK-LABEL: csr1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr2() nounwind { +; CHECK-LABEL: csr2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr3() nounwind { +; CHECK-LABEL: csr3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr4() nounwind { +; CHECK-LABEL: csr4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr5() nounwind { +; CHECK-LABEL: csr5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", 
"~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr6() nounwind { +; CHECK-LABEL: csr6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind { +; CHECK-LABEL: lea_in_epilog: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB6_5 +; CHECK-NEXT: # %bb.1: # %bb13 +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: movq %r9, %r14 +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; CHECK-NEXT: addq %r14, %r13 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: addq %r14, %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; CHECK-NEXT: addq %r14, %rbx +; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB6_2: # %bb15 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: incq %r12 +; CHECK-NEXT: movl $432, %edx # imm = 0x1B0 +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movq 
%r15, %rsi +; CHECK-NEXT: callq memcpy@PLT +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %rax, %r15 +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq $8, %rbp +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB6_2 +; CHECK-NEXT: # %bb.3: # %bb11 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: jne .LBB6_5 +; CHECK-NEXT: # %bb.4: # %bb12 +; CHECK-NEXT: movq $0, (%rax) +; CHECK-NEXT: .LBB6_5: # %bb14 +; CHECK-NEXT: retq +bb: + br i1 %arg, label %bb13, label %bb14 + +bb11: + br i1 %arg, label %bb14, label %bb12 + +bb12: + store double 0.000000e+00, ptr %arg1, align 8 + br label %bb14 + +bb13: + %getelementptr = getelementptr i8, ptr null, i64 %arg5 + br label %bb15 + +bb14: + ret void + +bb15: + %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ] + %getelementptr16 = getelementptr double, ptr null, i64 %phi + %add = add i64 %phi, 1 + %mul = mul i64 %arg6, %add + %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul + call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false) + %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7 + %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false) + %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false) + %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false) + br i1 %arg, label %bb11, label %bb15 +} + 
+!llvm.module.flags = !{!0} +!0 = !{i32 1, !"override-stack-alignment", i32 8} From 0f8e848ccf3bd1712dc8bf43ab5cd482ca0baccc Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Fri, 6 Jun 2025 15:01:47 +0800 Subject: [PATCH 2/2] Update comments and remove the complicated sub test. --- llvm/lib/Target/X86/X86FrameLowering.cpp | 1 + llvm/lib/Target/X86/X86MachineFunctionInfo.h | 2 +- ...op2-disabled-with-small-stack-alignment.ll | 92 ------------------- 3 files changed, 2 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 09b036b5f0c77..42a09106b3a3a 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2915,6 +2915,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // 1. Use push2 when // a) number of CSR > 1 if no need padding // b) number of CSR > 2 if need padding + // c) stack alignment >= 16 bytes // 2. When the number of CSR push is odd // a. Start to use push2 from the 1st push if stack is 16B aligned. // b. Start to use push2 from the 2nd push if stack is not 16B aligned. diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 24371369d4a45..5f974e5de9a19 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -149,7 +149,7 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// other tools to detect the extended record. 
bool HasSwiftAsyncContext = false; - /// Ajust stack for push2/pop2 + /// Adjust stack for push2/pop2 bool PadForPush2Pop2 = false; /// Candidate registers for push2/pop2 diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll index d72c0bbcbcc7d..f07e40b442c04 100644 --- a/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll +++ b/llvm/test/CodeGen/X86/apx/push2-pop2-disabled-with-small-stack-alignment.ll @@ -113,97 +113,5 @@ entry: ret void } -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) - -define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind { -; CHECK-LABEL: lea_in_epilog: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB6_5 -; CHECK-NEXT: # %bb.1: # %bb13 -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $16, %rsp -; CHECK-NEXT: movq %r9, %r14 -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r14 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; CHECK-NEXT: addq %r14, %r13 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; CHECK-NEXT: addq %r14, %r15 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: addq %r14, %rbx -; CHECK-NEXT: xorl %ebp, %ebp -; CHECK-NEXT: xorl %r12d, %r12d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB6_2: # %bb15 -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: incq %r12 -; CHECK-NEXT: movl $432, %edx # imm = 0x1B0 -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: movq %r15, %rsi -; CHECK-NEXT: callq memcpy@PLT -; CHECK-NEXT: movl 
{{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: addq %rax, %r13 -; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq $8, %rbp -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB6_2 -; CHECK-NEXT: # %bb.3: # %bb11 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: jne .LBB6_5 -; CHECK-NEXT: # %bb.4: # %bb12 -; CHECK-NEXT: movq $0, (%rax) -; CHECK-NEXT: .LBB6_5: # %bb14 -; CHECK-NEXT: retq -bb: - br i1 %arg, label %bb13, label %bb14 - -bb11: - br i1 %arg, label %bb14, label %bb12 - -bb12: - store double 0.000000e+00, ptr %arg1, align 8 - br label %bb14 - -bb13: - %getelementptr = getelementptr i8, ptr null, i64 %arg5 - br label %bb15 - -bb14: - ret void - -bb15: - %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ] - %getelementptr16 = getelementptr double, ptr null, i64 %phi - %add = add i64 %phi, 1 - %mul = mul i64 %arg6, %add - %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul - call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false) - %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7 - %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8 - call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false) - %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9 - call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false) - %getelementptr21 = getelementptr i8, ptr %getelementptr17, i64 %arg10 - call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false) - br i1 %arg, label %bb11, label %bb15 -} - !llvm.module.flags = !{!0} !0 = !{i32 1, !"override-stack-alignment", i32 8}