[AArch64] Adjust the scheduling info of SVE FCMP on Cortex-A510. #153810
Conversation
According to the SWOG, these have a lower throughput than other instructions. Mark them as taking multiple cycles to model that.
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes: According to the SWOG, these have a lower throughput than other instructions. Mark them as taking multiple cycles to model that.

Patch is 294.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153810.diff

10 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index b93d67f3091e7..356e3fa39c53f 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
// Floating point compare
-def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
"^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
"^FCM(LE|LT)_PPzZ0_[HSD]",
"^FCMUO_PPzZZ_[HSD]")>;
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
index d63f7e6f3242e..120ab7cc4552e 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
@@ -171,11 +171,11 @@ define <vscale x 8 x bfloat> @fptrunc_nxv8f32_to_nxv8bf16(<vscale x 8 x float> %
; NOBF16-NEXT: ptrue p0.s
; NOBF16-NEXT: and z3.s, z3.s, #0x1
; NOBF16-NEXT: and z4.s, z4.s, #0x1
-; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
; NOBF16-NEXT: add z5.s, z1.s, z2.s
; NOBF16-NEXT: add z2.s, z0.s, z2.s
-; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
+; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
; NOBF16-NEXT: orr z1.s, z1.s, #0x400000
+; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
; NOBF16-NEXT: add z3.s, z3.s, z5.s
; NOBF16-NEXT: add z2.s, z4.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 43744092a1348..71108f00a0054 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -94,8 +94,8 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z1.s, #32767 // =0x7fff
-; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.s
+; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
@@ -264,37 +264,37 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
; CHECK-NEXT: mov z6.d, #0xffffffff80000000
; CHECK-NEXT: movk x8, #16863, lsl #48
; CHECK-NEXT: mov z7.d, #0xffffffff80000000
-; CHECK-NEXT: mov z24.d, #0xffffffff80000000
-; CHECK-NEXT: mov z25.d, x8
-; CHECK-NEXT: fcmuo p6.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: mov z25.d, #0x7fffffff
+; CHECK-NEXT: mov z24.d, x8
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d
; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d
-; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d
-; CHECK-NEXT: mov z4.d, #0x7fffffff
-; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z25.d
+; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z24.d
; CHECK-NEXT: fcvtzs z5.d, p1/m, z1.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z25.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z24.d
; CHECK-NEXT: fcvtzs z6.d, p2/m, z0.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d
+; CHECK-NEXT: mov z4.d, #0xffffffff80000000
; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.d
-; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z25.d
-; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z25.d
-; CHECK-NEXT: fcvtzs z24.d, p4/m, z2.d
-; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d
-; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d
-; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d
+; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z24.d
+; CHECK-NEXT: sel z1.d, p1, z25.d, z5.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
+; CHECK-NEXT: fcvtzs z4.d, p2/m, z2.d
+; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: sel z0.d, p4, z25.d, z6.d
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT: fcmuo p6.d, p0/z, z3.d, z3.d
; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d
-; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d
-; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d
-; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d
+; CHECK-NEXT: sel z2.d, p1, z25.d, z7.d
+; CHECK-NEXT: sel z3.d, p5, z25.d, z4.d
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0
-; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT: mov z2.d, p6/m, #0 // =0x0
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.s, z3.s, z2.s
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -348,41 +348,41 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
; CHECK-NEXT: mov z5.d, #-32768 // =0xffffffffffff8000
; CHECK-NEXT: mov z4.d, x8
; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000
-; CHECK-NEXT: mov z6.d, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT: movk x8, #16607, lsl #48
; CHECK-NEXT: mov z7.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT: movk x8, #16607, lsl #48
+; CHECK-NEXT: mov z24.d, #-32768 // =0xffffffffffff8000
; CHECK-NEXT: mov z25.d, #32767 // =0x7fff
-; CHECK-NEXT: mov z24.d, x8
-; CHECK-NEXT: fcmuo p6.d, p0/z, z2.d, z2.d
+; CHECK-NEXT: mov z6.d, x8
; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d
; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d
; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d
-; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d
-; CHECK-NEXT: mov z4.d, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z24.d
+; CHECK-NEXT: fcmgt p4.d, p0/z, z2.d, z6.d
; CHECK-NEXT: fcvtzs z5.d, p1/m, z3.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
-; CHECK-NEXT: fcvtzs z6.d, p2/m, z2.d
-; CHECK-NEXT: fcvtzs z7.d, p3/m, z1.d
-; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d
-; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d
-; CHECK-NEXT: fcvtzs z4.d, p4/m, z0.d
-; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d
-; CHECK-NEXT: sel z2.d, p1, z25.d, z5.d
-; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d
+; CHECK-NEXT: fcvtzs z7.d, p2/m, z2.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d
+; CHECK-NEXT: fcvtzs z24.d, p3/m, z1.d
+; CHECK-NEXT: fcmuo p3.d, p0/z, z3.d, z3.d
+; CHECK-NEXT: mov z3.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d
+; CHECK-NEXT: sel z4.d, p1, z25.d, z5.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z6.d
+; CHECK-NEXT: fcvtzs z3.d, p2/m, z0.d
+; CHECK-NEXT: fcmuo p2.d, p0/z, z2.d, z2.d
+; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT: sel z0.d, p2, z25.d, z6.d
-; CHECK-NEXT: sel z1.d, p3, z25.d, z7.d
-; CHECK-NEXT: sel z3.d, p5, z25.d, z4.d
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT: sel z0.d, p4, z25.d, z7.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
-; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT: sel z1.d, p1, z25.d, z24.d
+; CHECK-NEXT: sel z2.d, p5, z25.d, z3.d
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z4.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -535,8 +535,8 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z1.s, #32767 // =0x7fff
-; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.h
+; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
@@ -556,8 +556,8 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z1.h, #32767 // =0x7fff
-; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
; CHECK-NEXT: fcvtzs z2.h, p1/m, z0.h
+; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
; CHECK-NEXT: sel z0.h, p2, z1.h, z2.h
; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
index 1df28198711e1..123f6c55c20ab 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
@@ -49,16 +49,16 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
; CHECK-LABEL: test_signed_v8f32_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: mov z4.s, w8
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, #0.0
; CHECK-NEXT: fcvtzu z2.s, p1/m, z0.s
; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z4.s
-; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s
; CHECK-NEXT: fcvtzu z3.s, p2/m, z1.s
+; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s
; CHECK-NEXT: mov z2.s, p1/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z3.s, p0/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z0.d, z2.d
@@ -95,13 +95,13 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
; CHECK-NEXT: movk w8, #18303, lsl #16
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0
; CHECK-NEXT: mov z4.s, w8
+; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fcvtzu z2.s, p1/m, z1.s
; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z4.s
; CHECK-NEXT: mov z1.s, #65535 // =0xffff
-; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s
; CHECK-NEXT: fcvtzu z3.s, p2/m, z0.s
+; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s
; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s
; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
@@ -141,8 +141,8 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0
; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.s
; CHECK-NEXT: fcmgt p1.s, p0/z, z2.s, z4.s
-; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s
; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.s
+; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s
; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret
@@ -187,13 +187,13 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
; CHECK-NEXT: movk x8, #16879, lsl #48
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
-; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
; CHECK-NEXT: mov z4.d, x8
+; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d
; CHECK-NEXT: mov z1.d, #0xffffffff
-; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
@@ -213,29 +213,29 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000
; CHECK-NEXT: movi v4.2d, #0000000000000000
+; CHECK-NEXT: movk x8, #16879, lsl #48
; CHECK-NEXT: movi v5.2d, #0000000000000000
; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: movk x8, #16879, lsl #48
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT: movi v24.2d, #0000000000000000
+; CHECK-NEXT: mov z7.d, x8
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0
-; CHECK-NEXT: movi v7.2d, #0000000000000000
; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0
-; CHECK-NEXT: mov z24.d, x8
; CHECK-NEXT: fcvtzu z4.d, p1/m, z1.d
; CHECK-NEXT: fcvtzu z5.d, p2/m, z0.d
; CHECK-NEXT: fcvtzu z6.d, p3/m, z3.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z24.d
-; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z24.d
-; CHECK-NEXT: mov z0.d, #0xffffffff
-; CHECK-NEXT: fcvtzu z7.d, p4/m, z2.d
-; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z24.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z7.d
+; CHECK-NEXT: fcvtzu z24.d, p4/m, z2.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z24.d
+; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z7.d
+; CHECK-NEXT: mov z0.d, #0xffffffff
+; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z7.d
+; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z7.d
; CHECK-NEXT: sel z1.d, p1, z0.d, z4.d
; CHECK-NEXT: sel z2.d, p2, z0.d, z5.d
; CHECK-NEXT: sel z3.d, p3, z0.d, z6.d
-; CHECK-NEXT: sel z4.d, p0, z0.d, z7.d
+; CHECK-NEXT: sel z4.d, p0, z0.d, z24.d
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp1 z1.s, z4.s, z3.s
; CHECK-NEXT: addvl sp, sp, #1
@@ -254,13 +254,13 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
; CHECK-NEXT: movk x8, #16623, lsl #48
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
-; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
; CHECK-NEXT: mov z4.d, x8
+; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d
; CHECK-NEXT: mov z1.d, #65535 // =0xffff
-; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
@@ -280,29 +280,29 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000
; CHECK-NEXT: movi v4.2d, #0000000000000000
+; CHECK-NEXT: movk x8, #16623, lsl #48
; CHECK-NEXT: movi v5.2d, #0000000000000000
; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: movk x8, #16623, lsl #48
; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0
+; CHECK-NEXT: movi v24.2d, #0000000000000000
+; CHECK-NEXT: mov z7.d, x8
; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0
; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0
-; CHECK-NEXT: movi v7.2d, #0000000000000000
; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0
-; CHECK-NEXT: mov z24.d, x8
; CHECK-NEXT: fcvtzu z4.d, p1/m, z3.d
; CHECK-NEXT: fcvtzu z5.d, p2/m, z2.d
; CHECK-NEXT: fcvtzu z6.d, p3/m, z1.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
-; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d
-; CHECK-NEXT: mov z2.d, #65535 // =0xffff
-; CHECK-NEXT: fcvtzu z7.d, p4/m, z0.d
-; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d
+; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z7.d
+; CHECK-NEXT: fcvtzu z24.d, p4/m, z0.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z24.d
+; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z7.d
+; CHECK-NEXT: mov z2.d, #65535 // =0xffff
+; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z7.d
+; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z7.d
; CHECK-NEXT: sel z0.d, p1, z2.d, z4.d
; CHECK-NEXT: sel z1.d, p2, z2.d, z5.d
; CHECK-NEXT: sel z3.d, p3, z2.d, z6.d
-; CHECK-NEXT: sel z2.d, p0, z2.d, z7.d
+; CHECK-NEXT: sel z2.d, p0, z2.d, z24.d
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
@@ -334,16 +334,16 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
; CHECK-LABEL: test_signed_v4f64_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff
+; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: mov z4.d, x8
; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, #0.0
; CHECK-NEXT: fcvtzu z2.d, p1/m, z0.d
; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d
-; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d
; CHECK-NEXT: fcvtzu z3.d, p2/m, z1.d
+; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d
; CHECK-NEXT: mov z2.d, p1/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z3.d, p0/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z0.d, z2.d
@@ -412,8 +412,8 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0
; CHECK-NEXT: fcvtzu z0.s, p1/m, z2.h
; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h
-; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
; CHECK-NEXT: fcvtzu z1.s, p2/m, z3.h
+; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
; CHECK-NEXT: mov z0.s, p1/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret
@@ -486,8 +486,8 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0
; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.h
; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h
-; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.h
+; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
index 12d49183edea4..d5a4838ff9687 100644
--- a/llvm/test/CodeGen/AArch64/sve-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -6,17 +6,17 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f16(<vscale x 1 x half> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov w8, #64511 // =0xfbff
+; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: mov w8, #31743 // =0x7bff
; CHECK-NEXT: frintx z0.h, p0/m, z0.h
-; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h
-; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h
-; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.h
+; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z1.d, #0x7fffffffffffffff
; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d
+; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f16(<vscale x 1 x half> %x)
@@ -29,17 +29,17 @@ define <vscale x 2 x i64> @llrint_v1i64_v2f16(<vscale x 2 x half> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: ...
[truncated]
Asher8118 left a comment:
I've not looked through all of the CodeGen test changes, but the code change itself makes sense and matches the SWOG, and the MCA test looks fine. LGTM.
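For reference, a quick way to eyeball the modelled throughput is to feed one of the affected compares through llvm-mca. This is a hypothetical spot check, not the MCA test from the patch:

```sh
# Hypothetical spot check: the reciprocal throughput reported for the SVE
# compare should reflect the VALU being held for two cycles on cortex-a510.
echo 'fcmge p0.s, p0/z, z0.s, z1.s' | \
  llvm-mca -mtriple=aarch64 -mattr=+sve -mcpu=cortex-a510 -iterations=100
```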
Thank you. I usually wouldn't do this so quickly, but I'll commit this now to get it out of the way of another patch I was writing.
@davemgreen it looks like this commit is causing an assertion failure in LLVM (it also manifests as a crash in non-assertions-enabled clang builds). The test case is being reduced.
Hello. Thanks for the heads-up. It shouldn't be altering anything with -mcpu=neoverse-n1, AFAIU, but let me know and I can take a look.
@davemgreen I've been trying to reduce IR, but couldn't get below ~2.5MB. @pranavk came up with a standalone C++ test case: https://gcc.godbolt.org/z/d75xdP81e |
Thanks - there is an .ll reproducer here: https://gcc.godbolt.org/z/Pfsh4Ef6a. It is some issue with the debug info of z30_z31 registers and the size of scalable vectors. I will put together a patch to see about fixing it, one way or another. |
From the follow-up commit: The AArch64 zsub regs are scalable, so they are defined with a size of -1 (which comes through as 65535). The RegisterSize is only 128, so the code that tries to find the overlapping regs of a z30_z31 in DwarfEmitter can crash trying to access out-of-range bits in a BitVector. Hexagon and x86 also contain subregs with unknown sizes. Ideally most of these would be scalable values, but in the meantime add a check that the registers are small enough to overlap with the current register size, to prevent us from crashing. This fixes the issue reported on #153810.
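A minimal sketch of the failure mode and the guard described above. The container and names are stand-ins for illustration, not the actual DwarfEmitter code:

```cpp
#include <cstdint>
#include <vector>

// Stand-in for a fixed-width per-register coverage mask (BitVector-like).
struct CoverageBits {
  std::vector<bool> Bits;
  explicit CoverageBits(unsigned N) : Bits(N, false) {}
  void set(unsigned I) { Bits.at(I) = true; } // out-of-range index throws
};

int main() {
  const unsigned RegisterSize = 128;        // bits tracked per register
  const unsigned SubRegSize = uint16_t(-1); // scalable size -1 wraps to 65535

  CoverageBits Covered(RegisterSize);
  // The guard from the fix, in spirit: skip sub-registers whose (unknown,
  // scalable) size cannot fit in the tracked width, rather than indexing
  // 65535 bits into a 128-bit mask.
  if (SubRegSize <= RegisterSize)
    for (unsigned I = 0; I != SubRegSize; ++I)
      Covered.set(I);
}
```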