Skip to content

Commit f07ab32

Browse files
committed
[AArch64][ISel] Select constructive EXT_ZZZI pseudo instruction
This patch changes the existing vector_splice patterns to select the constructive EXT_ZZZI pseudo instead of the destructive EXT_ZZI instruction. Because the registers are no longer tied, the register allocator has more freedom, and many MOVs are replaced with MOVPRFX. In some cases, however, the same register could have been chosen for both input and output, but the register allocator preferred not to. As a result, some test cases now contain more instructions: a MOVPRFX is emitted where no MOV was previously needed.
1 parent a6be08b commit f07ab32

21 files changed

+1197
-1023
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2135,19 +2135,19 @@ let Predicates = [HasSVE_or_SME] in {
21352135
// Splice with lane bigger or equal to 0
21362136
foreach VT = [nxv16i8] in
21372137
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))),
2138-
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
2138+
(EXT_ZZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
21392139

21402140
foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
21412141
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))),
2142-
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
2142+
(EXT_ZZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
21432143

21442144
foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
21452145
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))),
2146-
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
2146+
(EXT_ZZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
21472147

21482148
foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
21492149
def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))),
2150-
(EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
2150+
(EXT_ZZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
21512151

21522152
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
21532153
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;

llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -256,12 +256,13 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
256256
define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
257257
; CHECK-LABEL: splice_nxv2i1_idx:
258258
; CHECK: // %bb.0:
259-
; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
260259
; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1
260+
; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
261261
; CHECK-NEXT: ptrue p0.d
262-
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
263-
; CHECK-NEXT: and z1.d, z1.d, #0x1
264-
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
262+
; CHECK-NEXT: mov z0.d, z1.d
263+
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
264+
; CHECK-NEXT: and z0.d, z0.d, #0x1
265+
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
265266
; CHECK-NEXT: ret
266267
%res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
267268
ret <vscale x 2 x i1> %res
@@ -271,12 +272,13 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
271272
define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
272273
; CHECK-LABEL: splice_nxv4i1_idx:
273274
; CHECK: // %bb.0:
274-
; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
275275
; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
276+
; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
276277
; CHECK-NEXT: ptrue p0.s
277-
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
278-
; CHECK-NEXT: and z1.s, z1.s, #0x1
279-
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
278+
; CHECK-NEXT: mov z0.d, z1.d
279+
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
280+
; CHECK-NEXT: and z0.s, z0.s, #0x1
281+
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
280282
; CHECK-NEXT: ret
281283
%res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
282284
ret <vscale x 4 x i1> %res
@@ -286,12 +288,13 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
286288
define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
287289
; CHECK-LABEL: splice_nxv8i1_idx:
288290
; CHECK: // %bb.0:
289-
; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1
290291
; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1
292+
; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1
291293
; CHECK-NEXT: ptrue p0.h
292-
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
293-
; CHECK-NEXT: and z1.h, z1.h, #0x1
294-
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
294+
; CHECK-NEXT: mov z0.d, z1.d
295+
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
296+
; CHECK-NEXT: and z0.h, z0.h, #0x1
297+
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
295298
; CHECK-NEXT: ret
296299
%res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
297300
ret <vscale x 8 x i1> %res
@@ -301,12 +304,13 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
301304
define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
302305
; CHECK-LABEL: splice_nxv16i1_idx:
303306
; CHECK: // %bb.0:
304-
; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
305307
; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
308+
; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
306309
; CHECK-NEXT: ptrue p0.b
307-
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
308-
; CHECK-NEXT: and z1.b, z1.b, #0x1
309-
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
310+
; CHECK-NEXT: mov z0.d, z1.d
311+
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
312+
; CHECK-NEXT: and z0.b, z0.b, #0x1
313+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
310314
; CHECK-NEXT: ret
311315
%res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
312316
ret <vscale x 16 x i1> %res

llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
5050
; CHECK-LABEL: extract_v32i8_halves:
5151
; CHECK: // %bb.0: // %entry
5252
; CHECK-NEXT: ldr z0, [x0]
53-
; CHECK-NEXT: mov z1.d, z0.d
53+
; CHECK-NEXT: movprfx z1, z0
5454
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
5555
; CHECK-NEXT: str q1, [x1]
5656
; CHECK-NEXT: str q0, [x2]
@@ -68,7 +68,7 @@ define void @extract_v32i8_half_unaligned(ptr %in, ptr %out) #0 vscale_range(2,2
6868
; CHECK-LABEL: extract_v32i8_half_unaligned:
6969
; CHECK: // %bb.0: // %entry
7070
; CHECK-NEXT: ldr z0, [x0]
71-
; CHECK-NEXT: mov z1.d, z0.d
71+
; CHECK-NEXT: movprfx z1, z0
7272
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
7373
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4
7474
; CHECK-NEXT: str q0, [x1]
@@ -84,15 +84,16 @@ define void @extract_v32i8_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr
8484
; CHECK-LABEL: extract_v32i8_quarters:
8585
; CHECK: // %bb.0: // %entry
8686
; CHECK-NEXT: ldr z0, [x0]
87-
; CHECK-NEXT: mov z1.d, z0.d
88-
; CHECK-NEXT: mov z2.d, z0.d
87+
; CHECK-NEXT: movprfx z1, z0
8988
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
89+
; CHECK-NEXT: movprfx z2, z0
9090
; CHECK-NEXT: ext z2.b, z2.b, z0.b, #24
91+
; CHECK-NEXT: movprfx z3, z0
92+
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
9193
; CHECK-NEXT: str d1, [x1]
9294
; CHECK-NEXT: str d2, [x2]
9395
; CHECK-NEXT: str d0, [x3]
94-
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
95-
; CHECK-NEXT: str d0, [x4]
96+
; CHECK-NEXT: str d3, [x4]
9697
; CHECK-NEXT: ret
9798
entry:
9899
%b = load <32 x i8>, ptr %in
@@ -126,7 +127,7 @@ define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
126127
; CHECK: // %bb.0: // %entry
127128
; CHECK-NEXT: ldr z0, [x0]
128129
; CHECK-NEXT: ptrue p0.b, vl32
129-
; CHECK-NEXT: mov z1.d, z0.d
130+
; CHECK-NEXT: movprfx z1, z0
130131
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
131132
; CHECK-NEXT: st1b { z1.b }, p0, [x1]
132133
; CHECK-NEXT: st1b { z0.b }, p0, [x2]
@@ -207,7 +208,7 @@ define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
207208
; CHECK-LABEL: extract_v16i16_halves:
208209
; CHECK: // %bb.0: // %entry
209210
; CHECK-NEXT: ldr z0, [x0]
210-
; CHECK-NEXT: mov z1.d, z0.d
211+
; CHECK-NEXT: movprfx z1, z0
211212
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
212213
; CHECK-NEXT: str q1, [x1]
213214
; CHECK-NEXT: str q0, [x2]
@@ -240,7 +241,7 @@ define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
240241
; CHECK: // %bb.0: // %entry
241242
; CHECK-NEXT: ldr z0, [x0]
242243
; CHECK-NEXT: ptrue p0.h, vl16
243-
; CHECK-NEXT: mov z1.d, z0.d
244+
; CHECK-NEXT: movprfx z1, z0
244245
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
245246
; CHECK-NEXT: st1h { z1.h }, p0, [x1]
246247
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
@@ -322,7 +323,7 @@ define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
322323
; CHECK-LABEL: extract_v8i32_halves:
323324
; CHECK: // %bb.0: // %entry
324325
; CHECK-NEXT: ldr z0, [x0]
325-
; CHECK-NEXT: mov z1.d, z0.d
326+
; CHECK-NEXT: movprfx z1, z0
326327
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
327328
; CHECK-NEXT: str q1, [x1]
328329
; CHECK-NEXT: str q0, [x2]
@@ -355,7 +356,7 @@ define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
355356
; CHECK: // %bb.0: // %entry
356357
; CHECK-NEXT: ldr z0, [x0]
357358
; CHECK-NEXT: ptrue p0.s, vl8
358-
; CHECK-NEXT: mov z1.d, z0.d
359+
; CHECK-NEXT: movprfx z1, z0
359360
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
360361
; CHECK-NEXT: st1w { z1.s }, p0, [x1]
361362
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
@@ -426,7 +427,7 @@ define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
426427
; CHECK-LABEL: extract_v4i64_halves:
427428
; CHECK: // %bb.0: // %entry
428429
; CHECK-NEXT: ldr z0, [x0]
429-
; CHECK-NEXT: mov z1.d, z0.d
430+
; CHECK-NEXT: movprfx z1, z0
430431
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
431432
; CHECK-NEXT: str q1, [x1]
432433
; CHECK-NEXT: str q0, [x2]
@@ -459,7 +460,7 @@ define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
459460
; CHECK: // %bb.0: // %entry
460461
; CHECK-NEXT: ldr z0, [x0]
461462
; CHECK-NEXT: ptrue p0.d, vl4
462-
; CHECK-NEXT: mov z1.d, z0.d
463+
; CHECK-NEXT: movprfx z1, z0
463464
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
464465
; CHECK-NEXT: st1d { z1.d }, p0, [x1]
465466
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
@@ -553,7 +554,7 @@ define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
553554
; CHECK-LABEL: extract_v16half_halves:
554555
; CHECK: // %bb.0: // %entry
555556
; CHECK-NEXT: ldr z0, [x0]
556-
; CHECK-NEXT: mov z1.d, z0.d
557+
; CHECK-NEXT: movprfx z1, z0
557558
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
558559
; CHECK-NEXT: str q1, [x1]
559560
; CHECK-NEXT: str q0, [x2]
@@ -586,7 +587,7 @@ define void @extract_v32half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
586587
; CHECK: // %bb.0: // %entry
587588
; CHECK-NEXT: ldr z0, [x0]
588589
; CHECK-NEXT: ptrue p0.h, vl16
589-
; CHECK-NEXT: mov z1.d, z0.d
590+
; CHECK-NEXT: movprfx z1, z0
590591
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
591592
; CHECK-NEXT: st1h { z1.h }, p0, [x1]
592593
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
@@ -668,7 +669,7 @@ define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
668669
; CHECK-LABEL: extract_v8float_halves:
669670
; CHECK: // %bb.0: // %entry
670671
; CHECK-NEXT: ldr z0, [x0]
671-
; CHECK-NEXT: mov z1.d, z0.d
672+
; CHECK-NEXT: movprfx z1, z0
672673
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
673674
; CHECK-NEXT: str q1, [x1]
674675
; CHECK-NEXT: str q0, [x2]
@@ -701,7 +702,7 @@ define void @extract_v16float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
701702
; CHECK: // %bb.0: // %entry
702703
; CHECK-NEXT: ldr z0, [x0]
703704
; CHECK-NEXT: ptrue p0.s, vl8
704-
; CHECK-NEXT: mov z1.d, z0.d
705+
; CHECK-NEXT: movprfx z1, z0
705706
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
706707
; CHECK-NEXT: st1w { z1.s }, p0, [x1]
707708
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
@@ -772,7 +773,7 @@ define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
772773
; CHECK-LABEL: extract_v4double_halves:
773774
; CHECK: // %bb.0: // %entry
774775
; CHECK-NEXT: ldr z0, [x0]
775-
; CHECK-NEXT: mov z1.d, z0.d
776+
; CHECK-NEXT: movprfx z1, z0
776777
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
777778
; CHECK-NEXT: str q1, [x1]
778779
; CHECK-NEXT: str q0, [x2]
@@ -805,7 +806,7 @@ define void @extract_v8double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
805806
; CHECK: // %bb.0: // %entry
806807
; CHECK-NEXT: ldr z0, [x0]
807808
; CHECK-NEXT: ptrue p0.d, vl4
808-
; CHECK-NEXT: mov z1.d, z0.d
809+
; CHECK-NEXT: movprfx z1, z0
809810
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32
810811
; CHECK-NEXT: st1d { z1.d }, p0, [x1]
811812
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
@@ -908,7 +909,7 @@ define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
908909
; CHECK-NEXT: add x8, x8, :lo12:.LCPI59_0
909910
; CHECK-NEXT: ptrue p1.d
910911
; CHECK-NEXT: ldr z0, [x8]
911-
; CHECK-NEXT: mov z1.d, z0.d
912+
; CHECK-NEXT: movprfx z1, z0
912913
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
913914
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
914915
; CHECK-NEXT: cmeq v1.4s, v1.4s, #0

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 {
150150
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
151151
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
152152
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
153-
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
154-
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
153+
; VBITS_GE_256-NEXT: movprfx z1, z0
154+
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16
155155
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
156-
; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h
156+
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
157157
; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.h
158-
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
159-
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
158+
; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h
159+
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
160+
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
160161
; VBITS_GE_256-NEXT: ret
161162
;
162163
; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32:
@@ -551,13 +552,14 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) #0 {
551552
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
552553
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
553554
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
554-
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
555-
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
555+
; VBITS_GE_256-NEXT: movprfx z1, z0
556+
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16
556557
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
557-
; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.s
558+
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
558559
; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.s
559-
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
560-
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
560+
; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.s
561+
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
562+
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
561563
; VBITS_GE_256-NEXT: ret
562564
;
563565
; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64:
@@ -1043,13 +1045,14 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) #0 {
10431045
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
10441046
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
10451047
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1046-
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
1047-
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1048+
; VBITS_GE_256-NEXT: movprfx z1, z0
1049+
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16
10481050
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
1049-
; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.h
1051+
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
10501052
; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.h
1051-
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1052-
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
1053+
; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.h
1054+
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
1055+
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
10531056
; VBITS_GE_256-NEXT: ret
10541057
;
10551058
; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32:
@@ -1444,13 +1447,14 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) #0 {
14441447
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
14451448
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
14461449
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1447-
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
1448-
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1450+
; VBITS_GE_256-NEXT: movprfx z1, z0
1451+
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16
14491452
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
1450-
; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.s
1453+
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
14511454
; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.s
1452-
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
1453-
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
1455+
; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.s
1456+
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
1457+
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
14541458
; VBITS_GE_256-NEXT: ret
14551459
;
14561460
; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64:

0 commit comments

Comments
 (0)