@@ -37,6 +37,10 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
3737; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
3838; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
3939; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
40+ ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
41+ ; CHECK-NEXT: ptrue p1.h
42+ ; CHECK-NEXT: and z2.h, z2.h, #0x1
43+ ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
4044; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
4145; CHECK-NEXT: st1h { z0.h }, p0, [x0]
4246; CHECK-NEXT: ret
@@ -59,8 +63,15 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
5963; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
6064; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
6165; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h
62- ; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
63- ; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
66+ ; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
67+ ; VBITS_GE_256-NEXT: ptrue p1.h
68+ ; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff
69+ ; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
70+ ; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1
71+ ; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0
72+ ; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0
73+ ; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h
74+ ; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h
6475; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
6576; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
6677; VBITS_GE_256-NEXT: ret
@@ -71,6 +82,10 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
7182; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
7283; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
7384; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
85+ ; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
86+ ; VBITS_GE_512-NEXT: ptrue p1.h
87+ ; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
88+ ; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
7489; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
7590; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
7691; VBITS_GE_512-NEXT: ret
@@ -89,6 +104,10 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
89104; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
90105; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
91106; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
107+ ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
108+ ; CHECK-NEXT: ptrue p1.h
109+ ; CHECK-NEXT: and z2.h, z2.h, #0x1
110+ ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
92111; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
93112; CHECK-NEXT: st1h { z0.h }, p0, [x0]
94113; CHECK-NEXT: ret
@@ -107,6 +126,10 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
107126; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
108127; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
109128; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
129+ ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
130+ ; CHECK-NEXT: ptrue p1.h
131+ ; CHECK-NEXT: and z2.h, z2.h, #0x1
132+ ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
110133; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
111134; CHECK-NEXT: st1h { z0.h }, p0, [x0]
112135; CHECK-NEXT: ret
@@ -150,6 +173,10 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
150173; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
151174; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
152175; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
176+ ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
177+ ; CHECK-NEXT: ptrue p1.s
178+ ; CHECK-NEXT: and z2.s, z2.s, #0x1
179+ ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
153180; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
154181; CHECK-NEXT: st1w { z0.s }, p0, [x0]
155182; CHECK-NEXT: ret
@@ -172,8 +199,15 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
172199; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
173200; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
174201; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s
175- ; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
176- ; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
202+ ; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
203+ ; VBITS_GE_256-NEXT: ptrue p1.s
204+ ; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff
205+ ; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
206+ ; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1
207+ ; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0
208+ ; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0
209+ ; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s
210+ ; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s
177211; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
178212; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
179213; VBITS_GE_256-NEXT: ret
@@ -184,6 +218,10 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
184218; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
185219; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
186220; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
221+ ; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
222+ ; VBITS_GE_512-NEXT: ptrue p1.s
223+ ; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
224+ ; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
187225; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
188226; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
189227; VBITS_GE_512-NEXT: ret
@@ -202,6 +240,10 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
202240; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
203241; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
204242; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
243+ ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
244+ ; CHECK-NEXT: ptrue p1.s
245+ ; CHECK-NEXT: and z2.s, z2.s, #0x1
246+ ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
205247; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
206248; CHECK-NEXT: st1w { z0.s }, p0, [x0]
207249; CHECK-NEXT: ret
@@ -220,6 +262,10 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
220262; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
221263; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
222264; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
265+ ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
266+ ; CHECK-NEXT: ptrue p1.s
267+ ; CHECK-NEXT: and z2.s, z2.s, #0x1
268+ ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
223269; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
224270; CHECK-NEXT: st1w { z0.s }, p0, [x0]
225271; CHECK-NEXT: ret
@@ -264,6 +310,10 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
264310; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
265311; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
266312; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
313+ ; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
314+ ; CHECK-NEXT: ptrue p1.d
315+ ; CHECK-NEXT: and z2.d, z2.d, #0x1
316+ ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
267317; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
268318; CHECK-NEXT: st1d { z0.d }, p0, [x0]
269319; CHECK-NEXT: ret
@@ -286,8 +336,15 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
286336; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
287337; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
288338; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d
289- ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d
290- ; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d
339+ ; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
340+ ; VBITS_GE_256-NEXT: ptrue p1.d
341+ ; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff
342+ ; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
343+ ; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1
344+ ; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
345+ ; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0
346+ ; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d
347+ ; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d
291348; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
292349; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
293350; VBITS_GE_256-NEXT: ret
@@ -298,6 +355,10 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
298355; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
299356; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
300357; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
358+ ; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
359+ ; VBITS_GE_512-NEXT: ptrue p1.d
360+ ; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
361+ ; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
301362; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
302363; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
303364; VBITS_GE_512-NEXT: ret
@@ -316,6 +377,10 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
316377; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
317378; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
318379; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
380+ ; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
381+ ; CHECK-NEXT: ptrue p1.d
382+ ; CHECK-NEXT: and z2.d, z2.d, #0x1
383+ ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
319384; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
320385; CHECK-NEXT: st1d { z0.d }, p0, [x0]
321386; CHECK-NEXT: ret
@@ -334,6 +399,10 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
334399; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
335400; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
336401; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
402+ ; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
403+ ; CHECK-NEXT: ptrue p1.d
404+ ; CHECK-NEXT: and z2.d, z2.d, #0x1
405+ ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
337406; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
338407; CHECK-NEXT: st1d { z0.d }, p0, [x0]
339408; CHECK-NEXT: ret
0 commit comments