Conversation

@davemgreen
Collaborator

According to the SWOG (Software Optimization Guide), these have a lower throughput than other instructions. Mark them as taking multiple cycles to model that.

@llvmbot
Member

llvmbot commented Aug 15, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

According to the SWOG, these have a lower throughput than other instructions. Mark them as taking multiple cycles to model that.
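
For context, the only non-test change is a single line in the Cortex-A510 scheduling model; the rest of the diff is test churn from the re-scheduled output. As a rough sketch of what the two write classes express (field names assumed from LLVM's generic SchedWriteRes class; the real definitions live in llvm/lib/Target/AArch64/AArch64SchedA510.td, so treat this as an approximation, not the verbatim source):

// Sketch only, not the verbatim A510 definitions.
class CortexA510Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
  let Latency = n;            // result ready after n cycles
}

class CortexA510MCWrite<int n, int m, ProcResourceKind res>
    : SchedWriteRes<[res]> {
  let Latency = n;            // result ready after n cycles
  let ReleaseAtCycles = [m];  // unit stays busy for m cycles, so the
                              // modeled throughput is one op per m cycles
}

Moving the SVE floating-point compares from CortexA510Write<4, CortexA510UnitVALU> to CortexA510MCWrite<4, 2, CortexA510UnitVALU> keeps the latency at 4 but halves the modeled throughput, which is what reorders the instructions in the CodeGen tests and changes the cycle counts in the llvm-mca test below.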


Patch is 294.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153810.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64SchedA510.td (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/sve-bf16-converts.ll (+2-2)
  • (modified) llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll (+50-50)
  • (modified) llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll (+33-33)
  • (modified) llvm/test/CodeGen/AArch64/sve-llrint.ll (+1030-1031)
  • (modified) llvm/test/CodeGen/AArch64/sve-lrint.ll (+1030-1031)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll (+21-21)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll (+74-74)
  • (modified) llvm/test/CodeGen/AArch64/sve2-bf16-converts.ll (+12-12)
  • (modified) llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s (+103-103)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index b93d67f3091e7..356e3fa39c53f 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_
 def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
 
 // Floating point compare
-def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
                                             "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
                                             "^FCM(LE|LT)_PPzZ0_[HSD]",
                                             "^FCMUO_PPzZZ_[HSD]")>;
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
index d63f7e6f3242e..120ab7cc4552e 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
@@ -171,11 +171,11 @@ define <vscale x 8 x bfloat> @fptrunc_nxv8f32_to_nxv8bf16(<vscale x 8 x float> %
 ; NOBF16-NEXT:    ptrue p0.s
 ; NOBF16-NEXT:    and z3.s, z3.s, #0x1
 ; NOBF16-NEXT:    and z4.s, z4.s, #0x1
-; NOBF16-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
 ; NOBF16-NEXT:    add z5.s, z1.s, z2.s
 ; NOBF16-NEXT:    add z2.s, z0.s, z2.s
-; NOBF16-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; NOBF16-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
 ; NOBF16-NEXT:    orr z1.s, z1.s, #0x400000
+; NOBF16-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
 ; NOBF16-NEXT:    orr z0.s, z0.s, #0x400000
 ; NOBF16-NEXT:    add z3.s, z3.s, z5.s
 ; NOBF16-NEXT:    add z2.s, z4.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 43744092a1348..71108f00a0054 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -94,8 +94,8 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z1.s, #32767 // =0x7fff
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    sel z0.s, p2, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
@@ -264,37 +264,37 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    mov z6.d, #0xffffffff80000000
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
 ; CHECK-NEXT:    mov z7.d, #0xffffffff80000000
-; CHECK-NEXT:    mov z24.d, #0xffffffff80000000
-; CHECK-NEXT:    mov z25.d, x8
-; CHECK-NEXT:    fcmuo p6.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z25.d, #0x7fffffff
+; CHECK-NEXT:    mov z24.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    mov z4.d, #0x7fffffff
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z24.d
 ; CHECK-NEXT:    fcvtzs z5.d, p1/m, z1.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z24.d
 ; CHECK-NEXT:    fcvtzs z6.d, p2/m, z0.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    mov z4.d, #0xffffffff80000000
 ; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z25.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z3.d, z25.d
-; CHECK-NEXT:    fcvtzs z24.d, p4/m, z2.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    sel z1.d, p1, z25.d, z5.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z2.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z0.d, p4, z25.d, z6.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z3.d, z3.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
-; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
-; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
+; CHECK-NEXT:    sel z2.d, p1, z25.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z25.d, z4.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.s, z3.s, z2.s
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -348,41 +348,41 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    mov z5.d, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
-; CHECK-NEXT:    mov z6.d, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    movk x8, #16607, lsl #48
 ; CHECK-NEXT:    mov z7.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    movk x8, #16607, lsl #48
+; CHECK-NEXT:    mov z24.d, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
-; CHECK-NEXT:    mov z24.d, x8
-; CHECK-NEXT:    fcmuo p6.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z6.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    mov z4.d, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z2.d, z6.d
 ; CHECK-NEXT:    fcvtzs z5.d, p1/m, z3.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z24.d
-; CHECK-NEXT:    fcvtzs z6.d, p2/m, z2.d
-; CHECK-NEXT:    fcvtzs z7.d, p3/m, z1.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z24.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z24.d
-; CHECK-NEXT:    fcvtzs z4.d, p4/m, z0.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    sel z2.d, p1, z25.d, z5.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z6.d
+; CHECK-NEXT:    fcvtzs z7.d, p2/m, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcvtzs z24.d, p3/m, z1.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    mov z3.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z6.d
+; CHECK-NEXT:    sel z4.d, p1, z25.d, z5.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z6.d
+; CHECK-NEXT:    fcvtzs z3.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    mov z4.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z0.d, p2, z25.d, z6.d
-; CHECK-NEXT:    sel z1.d, p3, z25.d, z7.d
-; CHECK-NEXT:    sel z3.d, p5, z25.d, z4.d
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p4, z25.d, z7.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z1.d, p1, z25.d, z24.d
+; CHECK-NEXT:    sel z2.d, p5, z25.d, z3.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    uzp1 z1.s, z3.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z4.s
+; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -535,8 +535,8 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z1.s, #32767 // =0x7fff
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
 ; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
 ; CHECK-NEXT:    sel z0.s, p2, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
@@ -556,8 +556,8 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z1.h, #32767 // =0x7fff
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
 ; CHECK-NEXT:    fcvtzs z2.h, p1/m, z0.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
 ; CHECK-NEXT:    sel z0.h, p2, z1.h, z2.h
 ; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
index 1df28198711e1..123f6c55c20ab 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
@@ -49,16 +49,16 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
 ; CHECK-NEXT:    fcvtzu z2.s, p1/m, z0.s
 ; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z4.s
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    fcvtzu z3.s, p2/m, z1.s
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    mov z2.s, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
@@ -95,13 +95,13 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fcvtzu z2.s, p1/m, z1.s
 ; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    mov z1.s, #65535 // =0xffff
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    fcvtzu z3.s, p2/m, z0.s
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
@@ -141,8 +141,8 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fcvtzu z0.d, p1/m, z2.s
 ; CHECK-NEXT:    fcmgt p1.s, p0/z, z2.s, z4.s
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z3.s, z4.s
 ; CHECK-NEXT:    fcvtzu z1.d, p2/m, z3.s
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z3.s, z4.s
 ; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
@@ -187,13 +187,13 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcvtzu z2.d, p1/m, z1.d
 ; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    mov z1.d, #0xffffffff
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
@@ -213,29 +213,29 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    movi v24.2d, #0000000000000000
+; CHECK-NEXT:    mov z7.d, x8
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    mov z24.d, x8
 ; CHECK-NEXT:    fcvtzu z4.d, p1/m, z1.d
 ; CHECK-NEXT:    fcvtzu z5.d, p2/m, z0.d
 ; CHECK-NEXT:    fcvtzu z6.d, p3/m, z3.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z24.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z24.d
-; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    fcvtzu z7.d, p4/m, z2.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z7.d
+; CHECK-NEXT:    fcvtzu z24.d, p4/m, z2.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z7.d
+; CHECK-NEXT:    mov z0.d, #0xffffffff
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z3.d, z7.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z2.d, z7.d
 ; CHECK-NEXT:    sel z1.d, p1, z0.d, z4.d
 ; CHECK-NEXT:    sel z2.d, p2, z0.d, z5.d
 ; CHECK-NEXT:    sel z3.d, p3, z0.d, z6.d
-; CHECK-NEXT:    sel z4.d, p0, z0.d, z7.d
+; CHECK-NEXT:    sel z4.d, p0, z0.d, z24.d
 ; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.s, z4.s, z3.s
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -254,13 +254,13 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcvtzu z2.d, p1/m, z1.d
 ; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
@@ -280,29 +280,29 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    movi v24.2d, #0000000000000000
+; CHECK-NEXT:    mov z7.d, x8
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    mov z24.d, x8
 ; CHECK-NEXT:    fcvtzu z4.d, p1/m, z3.d
 ; CHECK-NEXT:    fcvtzu z5.d, p2/m, z2.d
 ; CHECK-NEXT:    fcvtzu z6.d, p3/m, z1.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z24.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z24.d
-; CHECK-NEXT:    mov z2.d, #65535 // =0xffff
-; CHECK-NEXT:    fcvtzu z7.d, p4/m, z0.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z24.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z7.d
+; CHECK-NEXT:    fcvtzu z24.d, p4/m, z0.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z7.d
+; CHECK-NEXT:    mov z2.d, #65535 // =0xffff
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z7.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z7.d
 ; CHECK-NEXT:    sel z0.d, p1, z2.d, z4.d
 ; CHECK-NEXT:    sel z1.d, p2, z2.d, z5.d
 ; CHECK-NEXT:    sel z3.d, p3, z2.d, z6.d
-; CHECK-NEXT:    sel z2.d, p0, z2.d, z7.d
+; CHECK-NEXT:    sel z2.d, p0, z2.d, z24.d
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z2.s, z3.s
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
@@ -334,16 +334,16 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcvtzu z2.d, p1/m, z0.d
 ; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcvtzu z3.d, p2/m, z1.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    mov z2.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
@@ -412,8 +412,8 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fcvtzu z0.s, p1/m, z2.h
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z4.h
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    fcvtzu z1.s, p2/m, z3.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    mov z0.s, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
@@ -486,8 +486,8 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fcvtzu z0.d, p1/m, z2.h
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z4.h
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    fcvtzu z1.d, p2/m, z3.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
index 12d49183edea4..d5a4838ff9687 100644
--- a/llvm/test/CodeGen/AArch64/sve-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -6,17 +6,17 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f16(<vscale x 1 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.d, #0x8000000000000000
-; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f16(<vscale x 1 x half> %x)
@@ -29,17 +29,17 @@ define <vscale x 2 x i64> @llrint_v1i64_v2f16(<vscale x 2 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ...
[truncated]

Contributor

@Asher8118 left a comment


I've not looked through all of the CodeGen test changes, but the code change itself makes sense and matches the SWOG, and the MCA test looks fine. LGTM.

@davemgreen
Collaborator Author

Thank you.

I usually wouldn't do this so quickly but I'll commit this now to get it out of the way of another patch I was writing.

@davemgreen merged commit 144f3c4 into llvm:main on Aug 15, 2025
11 checks passed
@davemgreen deleted the gh-a64-a510fcmp branch on August 15, 2025 at 14:45
@alexfh
Contributor

alexfh commented Sep 20, 2025

@davemgreen it looks like this commit is causing an assertion failure in LLVM (it also manifests as a crash in clang builds without assertions enabled):

assertion failed at llvm/include/llvm/ADT/SmallBitVector.h:389 in SmallBitVector &llvm::SmallBitVector::set(unsigned int, unsigned int): E <= size() && "Attempted to set out-of-bounds range!"
    @     0x555998d1a904  __assert_fail
    @     0x5559953c3e05  llvm::SmallBitVector::set()
    @     0x555996b62d1e  llvm::DwarfExpression::addMachineReg()
    @     0x555996b63805  llvm::DwarfExpression::addMachineRegExpression()
    @     0x555996b2a00a  llvm::DwarfDebug::emitDebugLocValue()::$_0::operator()()
    @     0x555996b29eeb  llvm::DwarfDebug::emitDebugLocValue()
    @     0x555996b24759  llvm::DebugLocEntry::finalize()
    @     0x555996b23c8c  llvm::DwarfDebug::collectEntityInfo()
    @     0x555996b276ce  llvm::DwarfDebug::endFunctionImpl()
    @     0x555996b10e98  llvm::DebugHandlerBase::endFunction()
    @     0x555996acf293  llvm::AsmPrinter::emitFunctionBody()
    @     0x55599544bf89  (anonymous namespace)::AArch64AsmPrinter::runOnMachineFunction()
    @     0x555996efdaa8  llvm::MachineFunctionPass::runOnFunction()
    @     0x5559989981e2  llvm::FPPassManager::runOnFunction()
    @     0x55599899f2b2  llvm::FPPassManager::runOnModule()
    @     0x555998998908  llvm::legacy::PassManagerImpl::run()
    @     0x555992e3ec7b  clang::emitBackendOutput()
    @     0x555992adc08e  clang::CodeGenAction::ExecuteAction()
    @     0x555993819c58  clang::FrontendAction::Execute()
    @     0x555993786a74  clang::CompilerInstance::ExecuteAction()
    @     0x555992ad4fa9  clang::ExecuteCompilerInvocation()
    @     0x555992ad1ae2  cc1_main()
...
0.      Program arguments: clang -O3 --target=aarch64-unknown-linux-gnu -mcpu=neoverse-n1 -c -o /dev/null /tmp/llvm-reduce-tmp.wN6jE/reduced.ll
1.      Code generation
2.      Running pass 'Function Pass Manager' on module '/tmp/llvm-reduce-tmp.wN6jE/reduced.ll'.
3.      Running pass 'AArch64 Assembly Printer' on function '@_ZN3hwy6N_SVE26detail10Sort16RowsILm16ENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingINS_9float16_tEEEEEEES6_EEvT0_PT1_mSC_'
clang: error: clang frontend command failed with exit code 134 (use -v to see invocation)
clang version 9999.0.0 (144f3c4cbf7164938eb1b6fea1688ac428ead9c6)
Target: aarch64-unknown-linux-gnu
Thread model: posix

The test case is being reduced.

@davemgreen
Collaborator Author

Hello. Thanks for the heads-up. AFAIU it shouldn't be altering anything with -mcpu=neoverse-n1, but let me know and I can take a look.

@alexfh
Contributor

alexfh commented Sep 22, 2025

@davemgreen I've been trying to reduce the IR, but couldn't get it below ~2.5 MB. @pranavk came up with a standalone C++ test case: https://gcc.godbolt.org/z/d75xdP81e

@davemgreen
Collaborator Author

Thanks - there is an .ll reproducer here: https://gcc.godbolt.org/z/Pfsh4Ef6a. It is an issue with the debug info for the z30_z31 registers and the size of scalable vectors. I will put together a patch to see about fixing it, one way or another.

alexfh pushed a commit that referenced this pull request Sep 24, 2025

The AArch64 zsub regs are scalable, so they are defined with a size of -1 (which comes through as 65535). The RegisterSize is only 128, so the code that tries to find the overlapping regs of a z30_z31 in DwarfEmitter can crash trying to access out-of-range bits in a BitVector. Hexagon and x86 also contain subregs with unknown sizes.

Ideally most of these would be scalable values, but in the meantime add a check that the registers are small enough to overlap with the current register size, to prevent us from crashing.

This fixes the issue reported on #153810.
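
To illustrate the first paragraph, a hypothetical TableGen sketch (the real entries are in AArch64RegisterInfo.td and use their own names): a sub-register index covering a scalable slice has no fixed bit size, so it is declared with -1, and since the size is stored unsigned it reads back as 65535.

// Hypothetical example, not the actual AArch64 definition: SubRegIndex
// takes <size, offset>; -1 marks an unknown (scalable) size, which the
// unsigned storage turns into 65535 and which then exceeds the 128-bit
// RegisterSize that DwarfExpression::addMachineReg iterates over.
def zsub_scalable_example : SubRegIndex<-1>;

The guard described in the commit message then simply skips any sub-register whose recorded size does not fit within the tracked register size, instead of setting bits past the end of the SmallBitVector.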
