- 
                Notifications
    You must be signed in to change notification settings 
- Fork 15k
[AArch64] Enable GlobalMerge on externals #158592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
GlobalMerge has been enabled for minsize for a while, this patch enables it more generally. In my testing it did not affect performance very much, especially with the linker relaxations we already perform, but should help reduce code size a little.
| @llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) ChangesGlobalMerge has been enabled for minsize for a while, this patch enables it more generally. In my testing it did not affect performance very much, especially with the linker relaxations we already perform, but should help reduce code size a little. Full diff: https://github.com/llvm/llvm-project/pull/158592.diff 4 Files Affected: 
 diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index dde1d88403bfe..5b80b08375f8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -694,12 +694,6 @@ bool AArch64PassConfig::addPreISel() {
     // is disabled as we emit the .subsections_via_symbols directive which
     // means that merging extern globals is not safe.
     bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
-
-    // FIXME: extern global merging is only enabled when we optimise for size
-    // because there are some regressions with it also enabled for performance.
-    if (!OnlyOptimizeForSize)
-      MergeExternalByDefault = false;
-
     addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize,
                                   MergeExternalByDefault));
   }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
index f37c942ab950c..dd0152c3f4c08 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
@@ -35,24 +35,24 @@ define dso_local void @testcase(ptr nocapture %arg){
 ;
 ; CHECK-O3-LABEL: testcase:
 ; CHECK-O3:       // %bb.0: // %entry
-; CHECK-O3-NEXT:    adrp x8, global_ptr
-; CHECK-O3-NEXT:    ldr x9, [x8, :lo12:global_ptr]
+; CHECK-O3-NEXT:    adrp x8, .L_MergedGlobals+8
+; CHECK-O3-NEXT:    ldr x9, [x8, :lo12:.L_MergedGlobals+8]
 ; CHECK-O3-NEXT:    cbz x9, .LBB0_2
 ; CHECK-O3-NEXT:  // %bb.1: // %if.then
 ; CHECK-O3-NEXT:    ldr x9, [x9]
 ; CHECK-O3-NEXT:    str x9, [x0]
-; CHECK-O3-NEXT:    ldr x8, [x8, :lo12:global_ptr]
-; CHECK-O3-NEXT:    adrp x9, global_int
+; CHECK-O3-NEXT:    ldr x8, [x8, :lo12:.L_MergedGlobals+8]
+; CHECK-O3-NEXT:    adrp x9, .L_MergedGlobals
 ; CHECK-O3-NEXT:    add x2, x8, #16
 ; CHECK-O3-NEXT:    mov w0, #10 // =0xa
-; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:global_int]
+; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:.L_MergedGlobals]
 ; CHECK-O3-NEXT:    b externalfunc
 ; CHECK-O3-NEXT:  .LBB0_2:
 ; CHECK-O3-NEXT:    mov x8, xzr
-; CHECK-O3-NEXT:    adrp x9, global_int
+; CHECK-O3-NEXT:    adrp x9, .L_MergedGlobals
 ; CHECK-O3-NEXT:    add x2, x8, #16
 ; CHECK-O3-NEXT:    mov w0, #10 // =0xa
-; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:global_int]
+; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:.L_MergedGlobals]
 ; CHECK-O3-NEXT:    b externalfunc
 ;
 ; CHECK-O2-6-LABEL: testcase:
diff --git a/llvm/test/CodeGen/AArch64/global-merge-external.ll b/llvm/test/CodeGen/AArch64/global-merge-external.ll
index fb3753c54e0ca..7f2b1e38ea4fc 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-external.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-external.ll
@@ -9,14 +9,22 @@ target triple = "aarch64"
 @global1 = dso_local local_unnamed_addr global i32 0, align 4
 
 define dso_local i32 @func() {
-; CHECK-LABEL: func:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, global0
-; CHECK-NEXT:    adrp x9, global1
-; CHECK-NEXT:    ldr w8, [x8, :lo12:global0]
-; CHECK-NEXT:    ldr w9, [x9, :lo12:global1]
-; CHECK-NEXT:    add w0, w9, w8
-; CHECK-NEXT:    ret
+; CHECK-O2-LABEL: func:
+; CHECK-O2:       // %bb.0: // %entry
+; CHECK-O2-NEXT:    adrp x8, global0
+; CHECK-O2-NEXT:    adrp x9, global1
+; CHECK-O2-NEXT:    ldr w8, [x8, :lo12:global0]
+; CHECK-O2-NEXT:    ldr w9, [x9, :lo12:global1]
+; CHECK-O2-NEXT:    add w0, w9, w8
+; CHECK-O2-NEXT:    ret
+;
+; CHECK-O3-LABEL: func:
+; CHECK-O3:       // %bb.0: // %entry
+; CHECK-O3-NEXT:    adrp x8, .L_MergedGlobals
+; CHECK-O3-NEXT:    add x8, x8, :lo12:.L_MergedGlobals
+; CHECK-O3-NEXT:    ldp w9, w8, [x8]
+; CHECK-O3-NEXT:    add w0, w8, w9
+; CHECK-O3-NEXT:    ret
 entry:
   %0 = load i32, ptr @global0, align 4
   %1 = load i32, ptr @global1, align 4
@@ -24,5 +32,4 @@ entry:
   ret i32 %add
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-O2: {{.*}}
-; CHECK-O3: {{.*}}
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
index caf6f1a83f762..1207eaa2612a3 100644
--- a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
+++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
@@ -26,8 +26,8 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 {
 ; CHECK-ASM-NEXT:    cbz x9, .LBB0_5
 ; CHECK-ASM-NEXT:  // %bb.2:
 ; CHECK-ASM-NEXT:    ldrsw x9, [sp, #8]
-; CHECK-ASM-NEXT:    adrp x10, B
-; CHECK-ASM-NEXT:    add x10, x10, :lo12:B
+; CHECK-ASM-NEXT:    adrp x10, .L_MergedGlobals
+; CHECK-ASM-NEXT:    add x10, x10, :lo12:.L_MergedGlobals
 ; CHECK-ASM-NEXT:    strb wzr, [x10, x8]
 ; CHECK-ASM-NEXT:    cmp x9, #10
 ; CHECK-ASM-NEXT:    b.hi .LBB0_6
@@ -36,9 +36,8 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 {
 ; CHECK-ASM-NEXT:    sub x8, x8, x9
 ; CHECK-ASM-NEXT:    cbz x8, .LBB0_6
 ; CHECK-ASM-NEXT:  // %bb.4:
-; CHECK-ASM-NEXT:    adrp x8, B2
-; CHECK-ASM-NEXT:    add x8, x8, :lo12:B2
-; CHECK-ASM-NEXT:    strb wzr, [x8, x9]
+; CHECK-ASM-NEXT:    add x8, x10, x9
+; CHECK-ASM-NEXT:    strb wzr, [x8, #10]
 ; CHECK-ASM-NEXT:    add sp, sp, #16
 ; CHECK-ASM-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-ASM-NEXT:    ret
 | 
| bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); | ||
|  | ||
| // FIXME: extern global merging is only enabled when we optimise for size | ||
| // because there are some regressions with it also enabled for performance. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was going to ask about these "some regressions", but did a git blame first and noticed I contributed this, so yes, that's funny and I can't remember what they were. Does this ring a bell to you, @davemgreen ? If I had to take a guess, then I did some measurements on smaller A-cores, i.e. not the wide X or Neoverse cores we have today. You did mention that it didn't affect perf very much. I am fine with the change, but my only question out of curiousity is which cores you've tested.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://reviews.llvm.org/D61947 mentions one small regression in spec2000, but it was apparently ran with -Oz.
I haven't seen any worrying performance results myself, either on server class cpus or models.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cheers, LGTM.
GlobalMerge has been enabled for minsize for a while, this patch enables it more generally. In my testing it did not affect performance very much, especially with the linker relaxations we already perform, but should help reduce code size a little.