Skip to content

Conversation

davemgreen
Copy link
Collaborator

GlobalMerge has been enabled for minsize for a while, this patch enables it more generally. In my testing it did not affect performance very much, especially with the linker relaxations we already perform, but should help reduce code size a little.

GlobalMerge has been enabled for minsize for a while, this patch enables it
more generally. In my testing it did not affect performance very much,
especially with the linker relaxations we already perform, but should help
reduce code size a little.
@llvmbot
Copy link
Member

llvmbot commented Sep 15, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

GlobalMerge has been enabled for minsize for a while, this patch enables it more generally. In my testing it did not affect performance very much, especially with the linker relaxations we already perform, but should help reduce code size a little.


Full diff: https://github.com/llvm/llvm-project/pull/158592.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (-6)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll (+7-7)
  • (modified) llvm/test/CodeGen/AArch64/global-merge-external.ll (+17-10)
  • (modified) llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll (+4-5)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index dde1d88403bfe..5b80b08375f8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -694,12 +694,6 @@ bool AArch64PassConfig::addPreISel() {
     // is disabled as we emit the .subsections_via_symbols directive which
     // means that merging extern globals is not safe.
     bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
-
-    // FIXME: extern global merging is only enabled when we optimise for size
-    // because there are some regressions with it also enabled for performance.
-    if (!OnlyOptimizeForSize)
-      MergeExternalByDefault = false;
-
     addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize,
                                   MergeExternalByDefault));
   }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
index f37c942ab950c..dd0152c3f4c08 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
@@ -35,24 +35,24 @@ define dso_local void @testcase(ptr nocapture %arg){
 ;
 ; CHECK-O3-LABEL: testcase:
 ; CHECK-O3:       // %bb.0: // %entry
-; CHECK-O3-NEXT:    adrp x8, global_ptr
-; CHECK-O3-NEXT:    ldr x9, [x8, :lo12:global_ptr]
+; CHECK-O3-NEXT:    adrp x8, .L_MergedGlobals+8
+; CHECK-O3-NEXT:    ldr x9, [x8, :lo12:.L_MergedGlobals+8]
 ; CHECK-O3-NEXT:    cbz x9, .LBB0_2
 ; CHECK-O3-NEXT:  // %bb.1: // %if.then
 ; CHECK-O3-NEXT:    ldr x9, [x9]
 ; CHECK-O3-NEXT:    str x9, [x0]
-; CHECK-O3-NEXT:    ldr x8, [x8, :lo12:global_ptr]
-; CHECK-O3-NEXT:    adrp x9, global_int
+; CHECK-O3-NEXT:    ldr x8, [x8, :lo12:.L_MergedGlobals+8]
+; CHECK-O3-NEXT:    adrp x9, .L_MergedGlobals
 ; CHECK-O3-NEXT:    add x2, x8, #16
 ; CHECK-O3-NEXT:    mov w0, #10 // =0xa
-; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:global_int]
+; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:.L_MergedGlobals]
 ; CHECK-O3-NEXT:    b externalfunc
 ; CHECK-O3-NEXT:  .LBB0_2:
 ; CHECK-O3-NEXT:    mov x8, xzr
-; CHECK-O3-NEXT:    adrp x9, global_int
+; CHECK-O3-NEXT:    adrp x9, .L_MergedGlobals
 ; CHECK-O3-NEXT:    add x2, x8, #16
 ; CHECK-O3-NEXT:    mov w0, #10 // =0xa
-; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:global_int]
+; CHECK-O3-NEXT:    ldr w1, [x9, :lo12:.L_MergedGlobals]
 ; CHECK-O3-NEXT:    b externalfunc
 ;
 ; CHECK-O2-6-LABEL: testcase:
diff --git a/llvm/test/CodeGen/AArch64/global-merge-external.ll b/llvm/test/CodeGen/AArch64/global-merge-external.ll
index fb3753c54e0ca..7f2b1e38ea4fc 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-external.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-external.ll
@@ -9,14 +9,22 @@ target triple = "aarch64"
 @global1 = dso_local local_unnamed_addr global i32 0, align 4
 
 define dso_local i32 @func() {
-; CHECK-LABEL: func:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, global0
-; CHECK-NEXT:    adrp x9, global1
-; CHECK-NEXT:    ldr w8, [x8, :lo12:global0]
-; CHECK-NEXT:    ldr w9, [x9, :lo12:global1]
-; CHECK-NEXT:    add w0, w9, w8
-; CHECK-NEXT:    ret
+; CHECK-O2-LABEL: func:
+; CHECK-O2:       // %bb.0: // %entry
+; CHECK-O2-NEXT:    adrp x8, global0
+; CHECK-O2-NEXT:    adrp x9, global1
+; CHECK-O2-NEXT:    ldr w8, [x8, :lo12:global0]
+; CHECK-O2-NEXT:    ldr w9, [x9, :lo12:global1]
+; CHECK-O2-NEXT:    add w0, w9, w8
+; CHECK-O2-NEXT:    ret
+;
+; CHECK-O3-LABEL: func:
+; CHECK-O3:       // %bb.0: // %entry
+; CHECK-O3-NEXT:    adrp x8, .L_MergedGlobals
+; CHECK-O3-NEXT:    add x8, x8, :lo12:.L_MergedGlobals
+; CHECK-O3-NEXT:    ldp w9, w8, [x8]
+; CHECK-O3-NEXT:    add w0, w8, w9
+; CHECK-O3-NEXT:    ret
 entry:
   %0 = load i32, ptr @global0, align 4
   %1 = load i32, ptr @global1, align 4
@@ -24,5 +32,4 @@ entry:
   ret i32 %add
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-O2: {{.*}}
-; CHECK-O3: {{.*}}
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
index caf6f1a83f762..1207eaa2612a3 100644
--- a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
+++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
@@ -26,8 +26,8 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 {
 ; CHECK-ASM-NEXT:    cbz x9, .LBB0_5
 ; CHECK-ASM-NEXT:  // %bb.2:
 ; CHECK-ASM-NEXT:    ldrsw x9, [sp, #8]
-; CHECK-ASM-NEXT:    adrp x10, B
-; CHECK-ASM-NEXT:    add x10, x10, :lo12:B
+; CHECK-ASM-NEXT:    adrp x10, .L_MergedGlobals
+; CHECK-ASM-NEXT:    add x10, x10, :lo12:.L_MergedGlobals
 ; CHECK-ASM-NEXT:    strb wzr, [x10, x8]
 ; CHECK-ASM-NEXT:    cmp x9, #10
 ; CHECK-ASM-NEXT:    b.hi .LBB0_6
@@ -36,9 +36,8 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 {
 ; CHECK-ASM-NEXT:    sub x8, x8, x9
 ; CHECK-ASM-NEXT:    cbz x8, .LBB0_6
 ; CHECK-ASM-NEXT:  // %bb.4:
-; CHECK-ASM-NEXT:    adrp x8, B2
-; CHECK-ASM-NEXT:    add x8, x8, :lo12:B2
-; CHECK-ASM-NEXT:    strb wzr, [x8, x9]
+; CHECK-ASM-NEXT:    add x8, x10, x9
+; CHECK-ASM-NEXT:    strb wzr, [x8, #10]
 ; CHECK-ASM-NEXT:    add sp, sp, #16
 ; CHECK-ASM-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-ASM-NEXT:    ret

bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();

// FIXME: extern global merging is only enabled when we optimise for size
// because there are some regressions with it also enabled for performance.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to ask about these "some regressions", but did a git blame first and noticed I contributed this, so yes, that's funny and I can't remember what they were. Does this ring a bell to you, @davemgreen ? If I had to take a guess, then I did some measurements on smaller A-cores, i.e. not the wide X or Neoverse cores we have today. You did mention that it didn't affect perf very much. I am fine with the change, but my only question out of curiousity is which cores you've tested.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://reviews.llvm.org/D61947 mentions one small regression in spec2000, but it was apparently ran with -Oz.

I haven't seen any worrying performance results myself, either on server class cpus or models.

Copy link
Collaborator

@sjoerdmeijer sjoerdmeijer left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cheers, LGTM.

@davemgreen davemgreen merged commit a50c11a into llvm:main Sep 17, 2025
11 of 12 checks passed
@davemgreen davemgreen deleted the gh-a64-globalmerge branch September 17, 2025 16:49
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants