@@ -93,57 +93,36 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
9393; CHECK-LABEL: test_insert_vector:
9494; CHECK: .p2align 4
9595; CHECK-NEXT: // %bb.0: // %entry
96- ; CHECK-NEXT: nopa ; nopb ; nopx ; mov r24, r16; nops
97- ; CHECK-NEXT: mov r25, r17
98- ; CHECK-NEXT: mov r26, r18
99- ; CHECK-NEXT: mov r27, r19
100- ; CHECK-NEXT: mova r19, #0
101- ; CHECK-NEXT: mova r18, #1
102- ; CHECK-NEXT: mova r17, #2
103- ; CHECK-NEXT: mova r16, #3
104- ; CHECK-NEXT: vextract.s32 r4, x4, r16
105- ; CHECK-NEXT: mova r16, #4
106- ; CHECK-NEXT: vextract.s32 r1, x4, r19
107- ; CHECK-NEXT: vextract.s32 r2, x4, r18
108- ; CHECK-NEXT: vextract.s32 r3, x4, r17
109- ; CHECK-NEXT: vextract.s32 r5, x4, r16
110- ; CHECK-NEXT: mova r16, #5
111- ; CHECK-NEXT: vextract.s32 r6, x4, r16
112- ; CHECK-NEXT: mova r16, #7
113- ; CHECK-NEXT: vextract.s32 r7, x4, r16
114- ; CHECK-NEXT: mova r16, #6
115- ; CHECK-NEXT: vextract.s32 r8, x4, r16
116- ; CHECK-NEXT: vpush.lo.32 x0, r7, x0
117- ; CHECK-NEXT: vpush.lo.32 x0, r8, x0
118- ; CHECK-NEXT: vpush.lo.32 x0, r6, x0
119- ; CHECK-NEXT: jz r0, #.LBB1_2
120- ; CHECK-NEXT: vpush.lo.32 x0, r5, x0 // Delay Slot 5
121- ; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
122- ; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 3
123- ; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
124- ; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1
96+ ; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
97+ ; CHECK-NEXT: nopx // Delay Slot 5
98+ ; CHECK-NEXT: nop // Delay Slot 4
99+ ; CHECK-NEXT: nop // Delay Slot 3
100+ ; CHECK-NEXT: mov r24, r16 // Delay Slot 2
101+ ; CHECK-NEXT: mova r16, #0 // Delay Slot 1
125102; CHECK-NEXT: // %bb.1: // %if.end
126- ; CHECK-NEXT: mova r16, #3; nopxm
127- ; CHECK-NEXT: vextract.s32 r0, x2, r19
128- ; CHECK-NEXT: vextract.s32 r1, x0, r19
129- ; CHECK-NEXT: vextract.s32 r2, x2, r18
130- ; CHECK-NEXT: vextract.s32 r3, x0, r18
131- ; CHECK-NEXT: vextract.s32 r4, x2, r17
132- ; CHECK-NEXT: vextract.s32 r5, x0, r17
103+ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x2, r16; nopv
104+ ; CHECK-NEXT: vextract.s32 r1, x4, r16
105+ ; CHECK-NEXT: mova r16, #1
106+ ; CHECK-NEXT: vextract.s32 r2, x2, r16
107+ ; CHECK-NEXT: vextract.s32 r3, x4, r16
108+ ; CHECK-NEXT: mova r16, #2
109+ ; CHECK-NEXT: vextract.s32 r4, x2, r16
110+ ; CHECK-NEXT: vextract.s32 r5, x4, r16
111+ ; CHECK-NEXT: mova r16, #3
133112; CHECK-NEXT: vextract.s32 r6, x2, r16
134- ; CHECK-NEXT: vextract.s32 r7, x0 , r16
113+ ; CHECK-NEXT: vextract.s32 r7, x4 , r16
135114; CHECK-NEXT: mova r16, #4
136115; CHECK-NEXT: vextract.s32 r8, x2, r16
137- ; CHECK-NEXT: vextract.s32 r9, x0 , r16
116+ ; CHECK-NEXT: vextract.s32 r9, x4 , r16
138117; CHECK-NEXT: mova r16, #5
139118; CHECK-NEXT: vextract.s32 r10, x2, r16
140- ; CHECK-NEXT: vextract.s32 r11, x0 , r16
119+ ; CHECK-NEXT: vextract.s32 r11, x4 , r16
141120; CHECK-NEXT: mova r16, #7
142121; CHECK-NEXT: vextract.s32 r12, x2, r16
143- ; CHECK-NEXT: vextract.s32 r13, x0 , r16
122+ ; CHECK-NEXT: vextract.s32 r13, x4 , r16
144123; CHECK-NEXT: mova r16, #6
145124; CHECK-NEXT: vextract.s32 r14, x2, r16
146- ; CHECK-NEXT: vextract.s32 r15, x0 , r16
125+ ; CHECK-NEXT: vextract.s32 r15, x4 , r16
147126; CHECK-NEXT: vpush.lo.32 x0, r13, x0
148127; CHECK-NEXT: vpush.lo.32 x0, r15, x0
149128; CHECK-NEXT: vpush.lo.32 x0, r11, x0
@@ -155,34 +134,37 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
155134; CHECK-NEXT: vpush.lo.32 x0, r12, x0
156135; CHECK-NEXT: vpush.lo.32 x0, r14, x0
157136; CHECK-NEXT: vpush.lo.32 x0, r10, x0
158- ; CHECK-NEXT: j #.LBB1_3
159- ; CHECK-NEXT: vpush.lo.32 x0, r8, x0 // Delay Slot 5
160- ; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 4
161- ; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 3
162- ; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
163- ; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 1
137+ ; CHECK-NEXT: vpush.lo.32 x0, r8, x0
138+ ; CHECK-NEXT: ret lr
139+ ; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
140+ ; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
141+ ; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
142+ ; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
143+ ; CHECK-NEXT: mov r16, r24 // Delay Slot 1
164144; CHECK-NEXT: .p2align 4
165145; CHECK-NEXT: .LBB1_2: // %if.then
166- ; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv
167- ; CHECK-NEXT: vextract.s32 r0, x0, r19
168- ; CHECK-NEXT: vextract.s32 r1, x2, r19
169- ; CHECK-NEXT: vextract.s32 r2, x0, r18
170- ; CHECK-NEXT: vextract.s32 r3, x2, r18
171- ; CHECK-NEXT: vextract.s32 r4, x0, r17
172- ; CHECK-NEXT: vextract.s32 r5, x2, r17
173- ; CHECK-NEXT: vextract.s32 r6, x0, r16
146+ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv
147+ ; CHECK-NEXT: vextract.s32 r1, x2, r16
148+ ; CHECK-NEXT: mova r16, #1
149+ ; CHECK-NEXT: vextract.s32 r2, x4, r16
150+ ; CHECK-NEXT: vextract.s32 r3, x2, r16
151+ ; CHECK-NEXT: mova r16, #2
152+ ; CHECK-NEXT: vextract.s32 r4, x4, r16
153+ ; CHECK-NEXT: vextract.s32 r5, x2, r16
154+ ; CHECK-NEXT: mova r16, #3
155+ ; CHECK-NEXT: vextract.s32 r6, x4, r16
174156; CHECK-NEXT: vextract.s32 r7, x2, r16
175157; CHECK-NEXT: mova r16, #4
176- ; CHECK-NEXT: vextract.s32 r8, x0 , r16
158+ ; CHECK-NEXT: vextract.s32 r8, x4 , r16
177159; CHECK-NEXT: vextract.s32 r9, x2, r16
178160; CHECK-NEXT: mova r16, #5
179- ; CHECK-NEXT: vextract.s32 r10, x0 , r16
161+ ; CHECK-NEXT: vextract.s32 r10, x4 , r16
180162; CHECK-NEXT: vextract.s32 r11, x2, r16
181163; CHECK-NEXT: mova r16, #7
182- ; CHECK-NEXT: vextract.s32 r12, x0 , r16
164+ ; CHECK-NEXT: vextract.s32 r12, x4 , r16
183165; CHECK-NEXT: vextract.s32 r13, x2, r16
184166; CHECK-NEXT: mova r16, #6
185- ; CHECK-NEXT: vextract.s32 r14, x0 , r16
167+ ; CHECK-NEXT: vextract.s32 r14, x4 , r16
186168; CHECK-NEXT: vextract.s32 r15, x2, r16
187169; CHECK-NEXT: vpush.lo.32 x0, r13, x0
188170; CHECK-NEXT: vpush.lo.32 x0, r15, x0
@@ -196,17 +178,11 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
196178; CHECK-NEXT: vpush.lo.32 x0, r14, x0
197179; CHECK-NEXT: vpush.lo.32 x0, r10, x0
198180; CHECK-NEXT: vpush.lo.32 x0, r8, x0
199- ; CHECK-NEXT: vpush.lo.32 x0, r6, x0
200- ; CHECK-NEXT: vpush.lo.32 x0, r4, x0
201- ; CHECK-NEXT: vpush.lo.32 x0, r2, x0
202- ; CHECK-NEXT: vpush.lo.32 x0, r0, x0
203- ; CHECK-NEXT: .p2align 4
204- ; CHECK-NEXT: .LBB1_3: // %cleanup
205- ; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops
206- ; CHECK-NEXT: nop // Delay Slot 5
207- ; CHECK-NEXT: mov r19, r27 // Delay Slot 4
208- ; CHECK-NEXT: mov r18, r26 // Delay Slot 3
209- ; CHECK-NEXT: mov r17, r25 // Delay Slot 2
181+ ; CHECK-NEXT: ret lr
182+ ; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
183+ ; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
184+ ; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
185+ ; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
210186; CHECK-NEXT: mov r16, r24 // Delay Slot 1
211187entry:
212188 %shuffle = shufflevector <8 x i32 > %b , <8 x i32 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef >
@@ -230,49 +206,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b
230206; CHECK-LABEL: test_concat_vector:
231207; CHECK: .p2align 4
232208; CHECK-NEXT: // %bb.0: // %entry
233- ; CHECK-NEXT: nopx ; mov r24, r16
234- ; CHECK-NEXT: mova r16, #0
235- ; CHECK-NEXT: vextract.s32 r0, x2, r16
236- ; CHECK-NEXT: vextract.s32 r1, x4, r16
237- ; CHECK-NEXT: mova r16, #1
238- ; CHECK-NEXT: vextract.s32 r2, x2, r16
239- ; CHECK-NEXT: vextract.s32 r3, x4, r16
240- ; CHECK-NEXT: mova r16, #2
241- ; CHECK-NEXT: vextract.s32 r4, x2, r16
242- ; CHECK-NEXT: vextract.s32 r5, x4, r16
243- ; CHECK-NEXT: mova r16, #3
244- ; CHECK-NEXT: vextract.s32 r6, x2, r16
245- ; CHECK-NEXT: vextract.s32 r7, x4, r16
246- ; CHECK-NEXT: mova r16, #4
247- ; CHECK-NEXT: vextract.s32 r8, x2, r16
248- ; CHECK-NEXT: vextract.s32 r9, x4, r16
249- ; CHECK-NEXT: mova r16, #5
250- ; CHECK-NEXT: vextract.s32 r10, x2, r16
251- ; CHECK-NEXT: vextract.s32 r11, x4, r16
252- ; CHECK-NEXT: mova r16, #7
253- ; CHECK-NEXT: vextract.s32 r12, x2, r16
254- ; CHECK-NEXT: vextract.s32 r13, x4, r16
255- ; CHECK-NEXT: mova r16, #6
256- ; CHECK-NEXT: vextract.s32 r14, x2, r16
257- ; CHECK-NEXT: vextract.s32 r15, x4, r16
258- ; CHECK-NEXT: vpush.lo.32 x0, r13, x0
259- ; CHECK-NEXT: vpush.lo.32 x0, r15, x0
260- ; CHECK-NEXT: vpush.lo.32 x0, r11, x0
261- ; CHECK-NEXT: vpush.lo.32 x0, r9, x0
262- ; CHECK-NEXT: vpush.lo.32 x0, r7, x0
263- ; CHECK-NEXT: vpush.lo.32 x0, r5, x0
264- ; CHECK-NEXT: vpush.lo.32 x0, r3, x0
265- ; CHECK-NEXT: vpush.lo.32 x0, r1, x0
266- ; CHECK-NEXT: vpush.lo.32 x0, r12, x0
267- ; CHECK-NEXT: vpush.lo.32 x0, r14, x0
268- ; CHECK-NEXT: vpush.lo.32 x0, r10, x0
269- ; CHECK-NEXT: vpush.lo.32 x0, r8, x0
270- ; CHECK-NEXT: ret lr
271- ; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
272- ; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
273- ; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
274- ; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
275- ; CHECK-NEXT: mov r16, r24 // Delay Slot 1
209+ ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
210+ ; CHECK-NEXT: nopx // Delay Slot 5
211+ ; CHECK-NEXT: nop // Delay Slot 4
212+ ; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
213+ ; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
214+ ; CHECK-NEXT: nop // Delay Slot 1
276215entry:
277216 %shuffle = shufflevector <8 x i32 > %a , <8 x i32 > %b , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
278217 ret <16 x i32 > %shuffle
@@ -282,44 +221,14 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
282221; CHECK-LABEL: test_set_vector:
283222; CHECK: .p2align 4
284223; CHECK-NEXT: // %bb.0: // %entry
285- ; CHECK-NEXT: nopa ; nopb ; nopx ; mov r9, r16
286- ; CHECK-NEXT: mova r16, #0
224+ ; CHECK-NEXT: mov r1, r16
287225; CHECK-NEXT: eqz r0, r0
288- ; CHECK-NEXT: vextract.s32 r1, x2, r16
289- ; CHECK-NEXT: mova r16, #1
290- ; CHECK-NEXT: vextract.s32 r2, x2, r16
291- ; CHECK-NEXT: mova r16, #2
292- ; CHECK-NEXT: vextract.s32 r3, x2, r16
293- ; CHECK-NEXT: mova r16, #3
294- ; CHECK-NEXT: vextract.s32 r4, x2, r16
295- ; CHECK-NEXT: mova r16, #4
296- ; CHECK-NEXT: vextract.s32 r5, x2, r16
297- ; CHECK-NEXT: mova r16, #5
298- ; CHECK-NEXT: vextract.s32 r6, x2, r16
299- ; CHECK-NEXT: mova r16, #7
300- ; CHECK-NEXT: vextract.s32 r7, x2, r16
301- ; CHECK-NEXT: mova r16, #6
302- ; CHECK-NEXT: vextract.s32 r8, x2, r16
303- ; CHECK-NEXT: add r16, r0, #-1
304- ; CHECK-NEXT: vpush.lo.32 x0, r7, x0
305- ; CHECK-NEXT: vpush.lo.32 x0, r8, x0
306- ; CHECK-NEXT: vpush.lo.32 x0, r6, x0
307- ; CHECK-NEXT: vpush.lo.32 x0, r5, x0
308- ; CHECK-NEXT: vpush.lo.32 x0, r4, x0
309- ; CHECK-NEXT: vpush.lo.32 x0, r3, x0
310- ; CHECK-NEXT: vpush.lo.32 x0, r2, x0
311- ; CHECK-NEXT: vpush.lo.32 x0, r1, x0
312- ; CHECK-NEXT: vpush.lo.32 x2, r0, x0
313- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2
314- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2
315- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2
316- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2
317226; CHECK-NEXT: ret lr
318- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5
319- ; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4
320- ; CHECK-NEXT: vpush.lo.32 x2 , r0, x2 // Delay Slot 3
321- ; CHECK-NEXT: vsel.32 x0, x0, x2 , r16 // Delay Slot 2
322- ; CHECK-NEXT: mov r16, r9 // Delay Slot 1
227+ ; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5
228+ ; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4
229+ ; CHECK-NEXT: add r16 , r0, #-1 // Delay Slot 3
230+ ; CHECK-NEXT: vsel.32 x0, x0, x0 , r16 // Delay Slot 2
231+ ; CHECK-NEXT: mov r16, r1 // Delay Slot 1
323232entry:
324233 %cmp = icmp eq i32 %idx , 0
325234 %shuffle = shufflevector <8 x i32 > %a , <8 x i32 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef , i32 undef >
0 commit comments