Skip to content

Commit 144f3c4

Browse files
authored
[AArch64] Adjust the scheduling info of SVE FCMP on Cortex-A510. (#153810)
According to the Arm Cortex-A510 Software Optimization Guide (SWOG), the SVE floating-point compare instructions have a lower throughput than other instructions. Mark them as taking multiple cycles to model that.
1 parent d719954 commit 144f3c4

File tree

10 files changed

+2356
-2358
lines changed

10 files changed

+2356
-2358
lines changed

llvm/lib/Target/AArch64/AArch64SchedA510.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_
10161016
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
10171017

10181018
// Floating point compare
1019-
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
1019+
def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
10201020
"^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
10211021
"^FCM(LE|LT)_PPzZ0_[HSD]",
10221022
"^FCMUO_PPzZZ_[HSD]")>;

llvm/test/CodeGen/AArch64/sve-bf16-converts.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,11 @@ define <vscale x 8 x bfloat> @fptrunc_nxv8f32_to_nxv8bf16(<vscale x 8 x float> %
171171
; NOBF16-NEXT: ptrue p0.s
172172
; NOBF16-NEXT: and z3.s, z3.s, #0x1
173173
; NOBF16-NEXT: and z4.s, z4.s, #0x1
174-
; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
175174
; NOBF16-NEXT: add z5.s, z1.s, z2.s
176175
; NOBF16-NEXT: add z2.s, z0.s, z2.s
177-
; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
176+
; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
178177
; NOBF16-NEXT: orr z1.s, z1.s, #0x400000
178+
; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
179179
; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
180180
; NOBF16-NEXT: add z3.s, z3.s, z5.s
181181
; NOBF16-NEXT: add z2.s, z4.s, z2.s

llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
9494
; CHECK-NEXT: mov z1.s, w8
9595
; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z1.s
9696
; CHECK-NEXT: mov z1.s, #32767 // =0x7fff
97-
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
9897
; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.s
98+
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
9999
; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s
100100
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
101101
; CHECK-NEXT: ret
@@ -264,37 +264,37 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
264264
; CHECK-NEXT: mov z6.d, #0xffffffff80000000
265265
; CHECK-NEXT: movk x8, #16863, lsl #48
266266
; CHECK-NEXT: mov z7.d, #0xffffffff80000000
267-
; CHECK-NEXT: mov z24.d, #0xffffffff80000000
268-
; CHECK-NEXT: mov z25.d, x8
269-
; CHECK-NEXT: fcmuo p6.d, p0/z, z0.d, z0.d
267+
; CHECK-NEXT: mov z25.d, #0x7fffffff
268+
; CHECK-NEXT: mov z24.d, x8
270269
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d
271270
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d
272271
; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d
273-
; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d
274-
; CHECK-NEXT: mov z4.d, #0x7fffffff
275-
; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z25.d
272+
; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z24.d
276273
; CHECK-NEXT: fcvtzs z5.d, p1/m, z1.d
277-
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z25.d
274+
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z24.d
278275
; CHECK-NEXT: fcvtzs z6.d, p2/m, z0.d
276+
; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d
277+
; CHECK-NEXT: mov z4.d, #0xffffffff80000000
279278
; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.d
280-
; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z25.d
281-
; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z25.d
282-
; CHECK-NEXT: fcvtzs z24.d, p4/m, z2.d
283-
; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d
284-
; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d
285-
; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d
279+
; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d
280+
; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z24.d
281+
; CHECK-NEXT: sel z1.d, p1, z25.d, z5.d
282+
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
283+
; CHECK-NEXT: fcvtzs z4.d, p2/m, z2.d
284+
; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d
285+
; CHECK-NEXT: sel z0.d, p4, z25.d, z6.d
286+
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
287+
; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0
288+
; CHECK-NEXT: fcmuo p6.d, p0/z, z3.d, z3.d
286289
; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d
287-
; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d
288-
; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d
289-
; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d
290+
; CHECK-NEXT: sel z2.d, p1, z25.d, z7.d
291+
; CHECK-NEXT: sel z3.d, p5, z25.d, z4.d
290292
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
291-
; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0
292-
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
293-
; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0
294-
; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0
293+
; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
294+
; CHECK-NEXT: mov z2.d, p6/m, #0 // =0x0
295295
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
296296
; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0
297-
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
297+
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
298298
; CHECK-NEXT: uzp1 z1.s, z3.s, z2.s
299299
; CHECK-NEXT: addvl sp, sp, #1
300300
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -348,41 +348,41 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
348348
; CHECK-NEXT: mov z5.d, #-32768 // =0xffffffffffff8000
349349
; CHECK-NEXT: mov z4.d, x8
350350
; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000
351-
; CHECK-NEXT: mov z6.d, #-32768 // =0xffffffffffff8000
352-
; CHECK-NEXT: movk x8, #16607, lsl #48
353351
; CHECK-NEXT: mov z7.d, #-32768 // =0xffffffffffff8000
352+
; CHECK-NEXT: movk x8, #16607, lsl #48
353+
; CHECK-NEXT: mov z24.d, #-32768 // =0xffffffffffff8000
354354
; CHECK-NEXT: mov z25.d, #32767 // =0x7fff
355-
; CHECK-NEXT: mov z24.d, x8
356-
; CHECK-NEXT: fcmuo p6.d, p0/z, z2.d, z2.d
355+
; CHECK-NEXT: mov z6.d, x8
357356
; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d
358357
; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d
359358
; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d
360-
; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d
361-
; CHECK-NEXT: mov z4.d, #-32768 // =0xffffffffffff8000
362-
; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z24.d
359+
; CHECK-NEXT: fcmgt p4.d, p0/z, z2.d, z6.d
363360
; CHECK-NEXT: fcvtzs z5.d, p1/m, z3.d
364-
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
365-
; CHECK-NEXT: fcvtzs z6.d, p2/m, z2.d
366-
; CHECK-NEXT: fcvtzs z7.d, p3/m, z1.d
367-
; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d
368-
; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d
369-
; CHECK-NEXT: fcvtzs z4.d, p4/m, z0.d
370-
; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d
371-
; CHECK-NEXT: sel z2.d, p1, z25.d, z5.d
372-
; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d
361+
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d
362+
; CHECK-NEXT: fcvtzs z7.d, p2/m, z2.d
363+
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d
364+
; CHECK-NEXT: fcvtzs z24.d, p3/m, z1.d
365+
; CHECK-NEXT: fcmuo p3.d, p0/z, z3.d, z3.d
366+
; CHECK-NEXT: mov z3.d, #-32768 // =0xffffffffffff8000
367+
; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d
368+
; CHECK-NEXT: sel z4.d, p1, z25.d, z5.d
369+
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z6.d
370+
; CHECK-NEXT: fcvtzs z3.d, p2/m, z0.d
371+
; CHECK-NEXT: fcmuo p2.d, p0/z, z2.d, z2.d
372+
; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d
373+
; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0
373374
; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
374-
; CHECK-NEXT: sel z0.d, p2, z25.d, z6.d
375-
; CHECK-NEXT: sel z1.d, p3, z25.d, z7.d
376-
; CHECK-NEXT: sel z3.d, p5, z25.d, z4.d
377-
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
378-
; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0
375+
; CHECK-NEXT: sel z0.d, p4, z25.d, z7.d
379376
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
380-
; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
381-
; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0
377+
; CHECK-NEXT: sel z1.d, p1, z25.d, z24.d
378+
; CHECK-NEXT: sel z2.d, p5, z25.d, z3.d
379+
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
380+
; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
381+
; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0
382+
; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
382383
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
383-
; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0
384-
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
385-
; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s
384+
; CHECK-NEXT: uzp1 z0.s, z0.s, z4.s
385+
; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
386386
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
387387
; CHECK-NEXT: addvl sp, sp, #1
388388
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -535,8 +535,8 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
535535
; CHECK-NEXT: mov z1.h, w8
536536
; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h
537537
; CHECK-NEXT: mov z1.s, #32767 // =0x7fff
538-
; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
539538
; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.h
539+
; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
540540
; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s
541541
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
542542
; CHECK-NEXT: ret
@@ -556,8 +556,8 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
556556
; CHECK-NEXT: mov z1.h, w8
557557
; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h
558558
; CHECK-NEXT: mov z1.h, #32767 // =0x7fff
559-
; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
560559
; CHECK-NEXT: fcvtzs z2.h, p1/m, z0.h
560+
; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h
561561
; CHECK-NEXT: sel z0.h, p2, z1.h, z2.h
562562
; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0
563563
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,16 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
4949
; CHECK-LABEL: test_signed_v8f32_v8i32:
5050
; CHECK: // %bb.0:
5151
; CHECK-NEXT: ptrue p0.s
52-
; CHECK-NEXT: movi v2.2d, #0000000000000000
5352
; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff
53+
; CHECK-NEXT: movi v2.2d, #0000000000000000
5454
; CHECK-NEXT: movi v3.2d, #0000000000000000
5555
; CHECK-NEXT: mov z4.s, w8
5656
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0
5757
; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, #0.0
5858
; CHECK-NEXT: fcvtzu z2.s, p1/m, z0.s
5959
; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z4.s
60-
; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s
6160
; CHECK-NEXT: fcvtzu z3.s, p2/m, z1.s
61+
; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s
6262
; CHECK-NEXT: mov z2.s, p1/m, #-1 // =0xffffffffffffffff
6363
; CHECK-NEXT: mov z3.s, p0/m, #-1 // =0xffffffffffffffff
6464
; CHECK-NEXT: mov z0.d, z2.d
@@ -95,13 +95,13 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
9595
; CHECK-NEXT: movk w8, #18303, lsl #16
9696
; CHECK-NEXT: movi v3.2d, #0000000000000000
9797
; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0
98-
; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0
9998
; CHECK-NEXT: mov z4.s, w8
99+
; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0
100100
; CHECK-NEXT: fcvtzu z2.s, p1/m, z1.s
101101
; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z4.s
102102
; CHECK-NEXT: mov z1.s, #65535 // =0xffff
103-
; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s
104103
; CHECK-NEXT: fcvtzu z3.s, p2/m, z0.s
104+
; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s
105105
; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s
106106
; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s
107107
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
@@ -141,8 +141,8 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
141141
; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0
142142
; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.s
143143
; CHECK-NEXT: fcmgt p1.s, p0/z, z2.s, z4.s
144-
; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s
145144
; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.s
145+
; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s
146146
; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff
147147
; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff
148148
; CHECK-NEXT: ret
@@ -187,13 +187,13 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
187187
; CHECK-NEXT: movk x8, #16879, lsl #48
188188
; CHECK-NEXT: movi v3.2d, #0000000000000000
189189
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
190-
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
191190
; CHECK-NEXT: mov z4.d, x8
191+
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
192192
; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d
193193
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d
194194
; CHECK-NEXT: mov z1.d, #0xffffffff
195-
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
196195
; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d
196+
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
197197
; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
198198
; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d
199199
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
@@ -213,29 +213,29 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
213213
; CHECK-NEXT: ptrue p0.d
214214
; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000
215215
; CHECK-NEXT: movi v4.2d, #0000000000000000
216+
; CHECK-NEXT: movk x8, #16879, lsl #48
216217
; CHECK-NEXT: movi v5.2d, #0000000000000000
217218
; CHECK-NEXT: movi v6.2d, #0000000000000000
218-
; CHECK-NEXT: movk x8, #16879, lsl #48
219219
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
220+
; CHECK-NEXT: movi v24.2d, #0000000000000000
221+
; CHECK-NEXT: mov z7.d, x8
220222
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
221223
; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0
222-
; CHECK-NEXT: movi v7.2d, #0000000000000000
223224
; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0
224-
; CHECK-NEXT: mov z24.d, x8
225225
; CHECK-NEXT: fcvtzu z4.d, p1/m, z1.d
226226
; CHECK-NEXT: fcvtzu z5.d, p2/m, z0.d
227227
; CHECK-NEXT: fcvtzu z6.d, p3/m, z3.d
228-
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z24.d
229-
; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z24.d
230-
; CHECK-NEXT: mov z0.d, #0xffffffff
231-
; CHECK-NEXT: fcvtzu z7.d, p4/m, z2.d
232-
; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z24.d
228+
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z7.d
229+
; CHECK-NEXT: fcvtzu z24.d, p4/m, z2.d
233230
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
234-
; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z24.d
231+
; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z7.d
232+
; CHECK-NEXT: mov z0.d, #0xffffffff
233+
; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z7.d
234+
; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z7.d
235235
; CHECK-NEXT: sel z1.d, p1, z0.d, z4.d
236236
; CHECK-NEXT: sel z2.d, p2, z0.d, z5.d
237237
; CHECK-NEXT: sel z3.d, p3, z0.d, z6.d
238-
; CHECK-NEXT: sel z4.d, p0, z0.d, z7.d
238+
; CHECK-NEXT: sel z4.d, p0, z0.d, z24.d
239239
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
240240
; CHECK-NEXT: uzp1 z1.s, z4.s, z3.s
241241
; CHECK-NEXT: addvl sp, sp, #1
@@ -254,13 +254,13 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
254254
; CHECK-NEXT: movk x8, #16623, lsl #48
255255
; CHECK-NEXT: movi v3.2d, #0000000000000000
256256
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0
257-
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
258257
; CHECK-NEXT: mov z4.d, x8
258+
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0
259259
; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d
260260
; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d
261261
; CHECK-NEXT: mov z1.d, #65535 // =0xffff
262-
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
263262
; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d
263+
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d
264264
; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d
265265
; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d
266266
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
@@ -280,29 +280,29 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
280280
; CHECK-NEXT: ptrue p0.d
281281
; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000
282282
; CHECK-NEXT: movi v4.2d, #0000000000000000
283+
; CHECK-NEXT: movk x8, #16623, lsl #48
283284
; CHECK-NEXT: movi v5.2d, #0000000000000000
284285
; CHECK-NEXT: movi v6.2d, #0000000000000000
285-
; CHECK-NEXT: movk x8, #16623, lsl #48
286286
; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0
287+
; CHECK-NEXT: movi v24.2d, #0000000000000000
288+
; CHECK-NEXT: mov z7.d, x8
287289
; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0
288290
; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0
289-
; CHECK-NEXT: movi v7.2d, #0000000000000000
290291
; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0
291-
; CHECK-NEXT: mov z24.d, x8
292292
; CHECK-NEXT: fcvtzu z4.d, p1/m, z3.d
293293
; CHECK-NEXT: fcvtzu z5.d, p2/m, z2.d
294294
; CHECK-NEXT: fcvtzu z6.d, p3/m, z1.d
295-
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d
296-
; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d
297-
; CHECK-NEXT: mov z2.d, #65535 // =0xffff
298-
; CHECK-NEXT: fcvtzu z7.d, p4/m, z0.d
299-
; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d
295+
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z7.d
296+
; CHECK-NEXT: fcvtzu z24.d, p4/m, z0.d
300297
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
301-
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z24.d
298+
; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z7.d
299+
; CHECK-NEXT: mov z2.d, #65535 // =0xffff
300+
; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z7.d
301+
; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z7.d
302302
; CHECK-NEXT: sel z0.d, p1, z2.d, z4.d
303303
; CHECK-NEXT: sel z1.d, p2, z2.d, z5.d
304304
; CHECK-NEXT: sel z3.d, p3, z2.d, z6.d
305-
; CHECK-NEXT: sel z2.d, p0, z2.d, z7.d
305+
; CHECK-NEXT: sel z2.d, p0, z2.d, z24.d
306306
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
307307
; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s
308308
; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
@@ -334,16 +334,16 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
334334
; CHECK-LABEL: test_signed_v4f64_v4i64:
335335
; CHECK: // %bb.0:
336336
; CHECK-NEXT: ptrue p0.d
337-
; CHECK-NEXT: movi v2.2d, #0000000000000000
338337
; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff
338+
; CHECK-NEXT: movi v2.2d, #0000000000000000
339339
; CHECK-NEXT: movi v3.2d, #0000000000000000
340340
; CHECK-NEXT: mov z4.d, x8
341341
; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0
342342
; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, #0.0
343343
; CHECK-NEXT: fcvtzu z2.d, p1/m, z0.d
344344
; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d
345-
; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d
346345
; CHECK-NEXT: fcvtzu z3.d, p2/m, z1.d
346+
; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d
347347
; CHECK-NEXT: mov z2.d, p1/m, #-1 // =0xffffffffffffffff
348348
; CHECK-NEXT: mov z3.d, p0/m, #-1 // =0xffffffffffffffff
349349
; CHECK-NEXT: mov z0.d, z2.d
@@ -412,8 +412,8 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
412412
; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0
413413
; CHECK-NEXT: fcvtzu z0.s, p1/m, z2.h
414414
; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h
415-
; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
416415
; CHECK-NEXT: fcvtzu z1.s, p2/m, z3.h
416+
; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
417417
; CHECK-NEXT: mov z0.s, p1/m, #-1 // =0xffffffffffffffff
418418
; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff
419419
; CHECK-NEXT: ret
@@ -486,8 +486,8 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
486486
; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0
487487
; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.h
488488
; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h
489-
; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
490489
; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.h
490+
; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h
491491
; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff
492492
; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff
493493
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)