Skip to content

Conversation

@asb
Copy link
Contributor

@asb asb commented Nov 15, 2025

As pointed out in post-commit discussion on #167336 #167336 (comment), although the test case succeeds in showing a codegen difference now the faulty MachineCopyPropagation logic was removed, the example was reduced so much that it actually would have been legal to remove the seemingly redundant mov.

This is a re-reduction of that test case which should now demonstrate a mov that can't safely be removed (mov w9, w9) because the upper bits no longer being zeroed may alter the program logic.

As pointed out in post-commit discussion on llvm#167336
<llvm#167336 (comment)>,
although the test case succeeds in showing a codegen difference now the
faulty MachineCopyPropagation logic was removed, the example was reduced
so much that it actually would have been legal to remove the seemingly
redundant mov.

This is a re-reduction of that test case which should now demonstrate a
mov that can't safely be removed (mov w9, w9) because the upper bits no
longer being zeroed may alter the program logic.
@llvmbot
Copy link
Member

llvmbot commented Nov 15, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Alex Bradbury (asb)

Changes

As pointed out in post-commit discussion on #167336 <#167336 (comment)>, although the test case succeeds in showing a codegen difference now the faulty MachineCopyPropagation logic was removed, the example was reduced so much that it actually would have been legal to remove the seemingly redundant mov.

This is a re-reduction of that test case which should now demonstrate a mov that can't safely be removed (mov w9, w9) because the upper bits no longer being zeroed may alter the program logic.


Full diff: https://github.com/llvm/llvm-project/pull/168194.diff

1 Files Affected:

  • (modified) llvm/test/CodeGen/AArch64/pr166870.ll (+77-46)
diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
index dc23f51987635..d6f99c67a01ff 100644
--- a/llvm/test/CodeGen/AArch64/pr166870.ll
+++ b/llvm/test/CodeGen/AArch64/pr166870.ll
@@ -1,68 +1,99 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -O3 < %s -mtriple=aarch64 | FileCheck %s
 
-; The seemingly redundant mov where src_reg == dst_reg shouldn't be removed,
-; because it has the effect of zeroing the upper bits in x8.
+; The seemingly redundant wreg mov where src_reg == dst_reg shouldn't be
+; removed, because it has the effect of zeroing the upper bits in the matching
+; xreg.
 
-define i32 @ham(i32 %arg, i1 %arg1, i1 %arg2, ptr %arg3) nounwind {
-; CHECK-LABEL: ham:
+define i32 @widget(i32 %arg, i32 %arg1, i1 %arg2, ptr %arg3, i1 %arg4) #0 nounwind {
+; CHECK-LABEL: widget:
 ; CHECK:       // %bb.0: // %bb
-; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    tbnz w1, #0, .LBB0_3
-; CHECK-NEXT:  // %bb.1: // %bb4
-; CHECK-NEXT:    tbnz w2, #0, .LBB0_3
-; CHECK-NEXT:  // %bb.2: // %bb5
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov w21, w1
-; CHECK-NEXT:    mov w20, w0
-; CHECK-NEXT:    bl zot
-; CHECK-NEXT:    tbz w21, #0, .LBB0_4
-; CHECK-NEXT:  .LBB0_3: // %bb6
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    tbz w2, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %bb5
+; CHECK-NEXT:    tbz w4, #0, .LBB0_4
+; CHECK-NEXT:  // %bb.3:
+; CHECK-NEXT:    mov w0, #0 // =0x0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    mov w8, w20
-; CHECK-NEXT:    mov w20, wzr
+; CHECK-NEXT:  .LBB0_4: // %bb6
+; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x3
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x21, x1
+; CHECK-NEXT:    bl baz
+; CHECK-NEXT:    mov w0, #0 // =0x0
+; CHECK-NEXT:    cbnz wzr, .LBB0_11
+; CHECK-NEXT:  // %bb.5: // %bb6
+; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    cbnz w10, .LBB0_11
+; CHECK-NEXT:  // %bb.6: // %bb7
+; CHECK-NEXT:    cbnz w10, .LBB0_10
+; CHECK-NEXT:  // %bb.7: // %bb8
+; CHECK-NEXT:    mov x8, x21
+; CHECK-NEXT:    mov x9, x20
+; CHECK-NEXT:    mov w20, #0 // =0x0
+; CHECK-NEXT:    mov w9, w9
+; CHECK-NEXT:    mov x21, x9
 ; CHECK-NEXT:    mov w8, w8
-; CHECK-NEXT:    mov w21, w8
-; CHECK-NEXT:  .LBB0_5: // %bb7
+; CHECK-NEXT:    mov x22, x8
+; CHECK-NEXT:  .LBB0_8: // %bb10
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    strb w20, [x19]
-; CHECK-NEXT:    cbnz x21, .LBB0_5
-; CHECK-NEXT:  // %bb.6: // %bb8
-; CHECK-NEXT:    // in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    bl quux
-; CHECK-NEXT:    b .LBB0_5
+; CHECK-NEXT:    cbnz x21, .LBB0_8
+; CHECK-NEXT:  // %bb.9: // %bb12
+; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
+; CHECK-NEXT:    bl snork
+; CHECK-NEXT:    cbnz x22, .LBB0_8
+; CHECK-NEXT:  .LBB0_10:
+; CHECK-NEXT:    mov w0, #0 // =0x0
+; CHECK-NEXT:  .LBB0_11:
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 bb:
-  br i1 %arg1, label %bb6, label %bb4
-
-bb4:
-  %load = load ptr, ptr null, align 8
-  br i1 %arg2, label %bb6, label %bb5
+  br i1 %arg2, label %bb14, label %bb5
 
 bb5:
-  %call = call i32 @zot() #0
-  %zext = zext i32 %arg to i64
-  br i1 %arg1, label %bb6, label %bb7
+  %load = load ptr, ptr null, align 8
+  br i1 %arg4, label %bb14, label %bb6
 
 bb6:
-  ret i32 0
+  %call = call i32 @baz() #1
+  %or = or i1 false, true
+  br i1 %or, label %bb14, label %bb7
 
 bb7:
-  store i8 0, ptr %arg3, align 1
-  %icmp = icmp eq i64 %zext, 0
-  br i1 %icmp, label %bb8, label %bb7
+  %icmp = icmp eq i32 0, 0
+  %zext = zext i32 %arg to i64
+  br i1 %icmp, label %bb14, label %bb8
 
 bb8:
-  call void @quux()
-  br label %bb7
+  %zext9 = zext i32 %arg1 to i64
+  br label %bb10
+
+bb10:
+  store i8 0, ptr %arg3, align 1
+  %icmp11 = icmp eq i64 %zext, 0
+  br i1 %icmp11, label %bb12, label %bb10
+
+bb12:
+  call void @snork()
+  %icmp13 = icmp eq i64 0, %zext9
+  br i1 %icmp13, label %bb14, label %bb10
+
+bb14:
+  %phi = phi i32 [ 0, %bb6 ], [ 0, %bb7 ], [ 0, %bb12 ], [ 1, %bb ], [ 0, %bb5 ]
+  ret i32 %phi
 }
 
-declare i32 @zot()
+declare i32 @baz()
 
-declare void @quux()
+declare void @snork()
 
-attributes #0 = { returns_twice }
+attributes #0 = { "target-cpu"="apple-m1" }
+attributes #1 = { returns_twice }

Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for doing this - LGTM. You even managed to find two examples of it going wrong.

(I think the reason we see it being incorrect here and not in other cases it due to mcpu=apple using +zcm-gpr64 but not +zcm-gpr32, so the grp32 COPYs prefer to get lowered to a mov x, not a mov w as we would usually use. It means the top bits are not cleared by other movs).

@asb asb merged commit be9e287 into llvm:main Nov 16, 2025
12 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants