Skip to content

Commit 5a53fd6

Browse files
[AIE2] Enable G_CONCAT_VECTOR optimizations for AIE2
1 parent 562ccea commit 5a53fd6

File tree

2 files changed

+62
-149
lines changed

2 files changed

+62
-149
lines changed

llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
2323
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
2424
#include "llvm/CodeGen/MachineDominators.h"
25+
#include "llvm/CodeGen/TargetOpcodes.h"
2526
#include "llvm/IR/IntrinsicsAIE2.h"
2627
#include "llvm/InitializePasses.h"
2728

@@ -167,6 +168,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
167168
case TargetOpcode::G_INTRINSIC: {
168169
return tryToCombineIntrinsic(MI);
169170
}
171+
case TargetOpcode::G_SHUFFLE_VECTOR: {
172+
return Helper.tryCombineShuffleVector(MI);
173+
}
170174
default:
171175
break;
172176
}

llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll

Lines changed: 58 additions & 149 deletions
Original file line number | Diff line number | Diff line change
@@ -93,57 +93,36 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
9393
; CHECK-LABEL: test_insert_vector:
9494
; CHECK: .p2align 4
9595
; CHECK-NEXT: // %bb.0: // %entry
96-
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r24, r16; nops
97-
; CHECK-NEXT: mov r25, r17
98-
; CHECK-NEXT: mov r26, r18
99-
; CHECK-NEXT: mov r27, r19
100-
; CHECK-NEXT: mova r19, #0
101-
; CHECK-NEXT: mova r18, #1
102-
; CHECK-NEXT: mova r17, #2
103-
; CHECK-NEXT: mova r16, #3
104-
; CHECK-NEXT: vextract.s32 r4, x4, r16
105-
; CHECK-NEXT: mova r16, #4
106-
; CHECK-NEXT: vextract.s32 r1, x4, r19
107-
; CHECK-NEXT: vextract.s32 r2, x4, r18
108-
; CHECK-NEXT: vextract.s32 r3, x4, r17
109-
; CHECK-NEXT: vextract.s32 r5, x4, r16
110-
; CHECK-NEXT: mova r16, #5
111-
; CHECK-NEXT: vextract.s32 r6, x4, r16
112-
; CHECK-NEXT: mova r16, #7
113-
; CHECK-NEXT: vextract.s32 r7, x4, r16
114-
; CHECK-NEXT: mova r16, #6
115-
; CHECK-NEXT: vextract.s32 r8, x4, r16
116-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
117-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
118-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
119-
; CHECK-NEXT: jz r0, #.LBB1_2
120-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0 // Delay Slot 5
121-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
122-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 3
123-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
124-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1
96+
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
97+
; CHECK-NEXT: nopx // Delay Slot 5
98+
; CHECK-NEXT: nop // Delay Slot 4
99+
; CHECK-NEXT: nop // Delay Slot 3
100+
; CHECK-NEXT: mov r24, r16 // Delay Slot 2
101+
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
125102
; CHECK-NEXT: // %bb.1: // %if.end
126-
; CHECK-NEXT: mova r16, #3; nopxm
127-
; CHECK-NEXT: vextract.s32 r0, x2, r19
128-
; CHECK-NEXT: vextract.s32 r1, x0, r19
129-
; CHECK-NEXT: vextract.s32 r2, x2, r18
130-
; CHECK-NEXT: vextract.s32 r3, x0, r18
131-
; CHECK-NEXT: vextract.s32 r4, x2, r17
132-
; CHECK-NEXT: vextract.s32 r5, x0, r17
103+
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x2, r16; nopv
104+
; CHECK-NEXT: vextract.s32 r1, x4, r16
105+
; CHECK-NEXT: mova r16, #1
106+
; CHECK-NEXT: vextract.s32 r2, x2, r16
107+
; CHECK-NEXT: vextract.s32 r3, x4, r16
108+
; CHECK-NEXT: mova r16, #2
109+
; CHECK-NEXT: vextract.s32 r4, x2, r16
110+
; CHECK-NEXT: vextract.s32 r5, x4, r16
111+
; CHECK-NEXT: mova r16, #3
133112
; CHECK-NEXT: vextract.s32 r6, x2, r16
134-
; CHECK-NEXT: vextract.s32 r7, x0, r16
113+
; CHECK-NEXT: vextract.s32 r7, x4, r16
135114
; CHECK-NEXT: mova r16, #4
136115
; CHECK-NEXT: vextract.s32 r8, x2, r16
137-
; CHECK-NEXT: vextract.s32 r9, x0, r16
116+
; CHECK-NEXT: vextract.s32 r9, x4, r16
138117
; CHECK-NEXT: mova r16, #5
139118
; CHECK-NEXT: vextract.s32 r10, x2, r16
140-
; CHECK-NEXT: vextract.s32 r11, x0, r16
119+
; CHECK-NEXT: vextract.s32 r11, x4, r16
141120
; CHECK-NEXT: mova r16, #7
142121
; CHECK-NEXT: vextract.s32 r12, x2, r16
143-
; CHECK-NEXT: vextract.s32 r13, x0, r16
122+
; CHECK-NEXT: vextract.s32 r13, x4, r16
144123
; CHECK-NEXT: mova r16, #6
145124
; CHECK-NEXT: vextract.s32 r14, x2, r16
146-
; CHECK-NEXT: vextract.s32 r15, x0, r16
125+
; CHECK-NEXT: vextract.s32 r15, x4, r16
147126
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
148127
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
149128
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
@@ -155,34 +134,37 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
155134
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
156135
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
157136
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
158-
; CHECK-NEXT: j #.LBB1_3
159-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0 // Delay Slot 5
160-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 4
161-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 3
162-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
163-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 1
137+
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
138+
; CHECK-NEXT: ret lr
139+
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
140+
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
141+
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
142+
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
143+
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
164144
; CHECK-NEXT: .p2align 4
165145
; CHECK-NEXT: .LBB1_2: // %if.then
166-
; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv
167-
; CHECK-NEXT: vextract.s32 r0, x0, r19
168-
; CHECK-NEXT: vextract.s32 r1, x2, r19
169-
; CHECK-NEXT: vextract.s32 r2, x0, r18
170-
; CHECK-NEXT: vextract.s32 r3, x2, r18
171-
; CHECK-NEXT: vextract.s32 r4, x0, r17
172-
; CHECK-NEXT: vextract.s32 r5, x2, r17
173-
; CHECK-NEXT: vextract.s32 r6, x0, r16
146+
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv
147+
; CHECK-NEXT: vextract.s32 r1, x2, r16
148+
; CHECK-NEXT: mova r16, #1
149+
; CHECK-NEXT: vextract.s32 r2, x4, r16
150+
; CHECK-NEXT: vextract.s32 r3, x2, r16
151+
; CHECK-NEXT: mova r16, #2
152+
; CHECK-NEXT: vextract.s32 r4, x4, r16
153+
; CHECK-NEXT: vextract.s32 r5, x2, r16
154+
; CHECK-NEXT: mova r16, #3
155+
; CHECK-NEXT: vextract.s32 r6, x4, r16
174156
; CHECK-NEXT: vextract.s32 r7, x2, r16
175157
; CHECK-NEXT: mova r16, #4
176-
; CHECK-NEXT: vextract.s32 r8, x0, r16
158+
; CHECK-NEXT: vextract.s32 r8, x4, r16
177159
; CHECK-NEXT: vextract.s32 r9, x2, r16
178160
; CHECK-NEXT: mova r16, #5
179-
; CHECK-NEXT: vextract.s32 r10, x0, r16
161+
; CHECK-NEXT: vextract.s32 r10, x4, r16
180162
; CHECK-NEXT: vextract.s32 r11, x2, r16
181163
; CHECK-NEXT: mova r16, #7
182-
; CHECK-NEXT: vextract.s32 r12, x0, r16
164+
; CHECK-NEXT: vextract.s32 r12, x4, r16
183165
; CHECK-NEXT: vextract.s32 r13, x2, r16
184166
; CHECK-NEXT: mova r16, #6
185-
; CHECK-NEXT: vextract.s32 r14, x0, r16
167+
; CHECK-NEXT: vextract.s32 r14, x4, r16
186168
; CHECK-NEXT: vextract.s32 r15, x2, r16
187169
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
188170
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
@@ -196,17 +178,11 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
196178
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
197179
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
198180
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
199-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
200-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
201-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0
202-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
203-
; CHECK-NEXT: .p2align 4
204-
; CHECK-NEXT: .LBB1_3: // %cleanup
205-
; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops
206-
; CHECK-NEXT: nop // Delay Slot 5
207-
; CHECK-NEXT: mov r19, r27 // Delay Slot 4
208-
; CHECK-NEXT: mov r18, r26 // Delay Slot 3
209-
; CHECK-NEXT: mov r17, r25 // Delay Slot 2
181+
; CHECK-NEXT: ret lr
182+
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
183+
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
184+
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
185+
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
210186
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
211187
entry:
212188
%shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -230,49 +206,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b
230206
; CHECK-LABEL: test_concat_vector:
231207
; CHECK: .p2align 4
232208
; CHECK-NEXT: // %bb.0: // %entry
233-
; CHECK-NEXT: nopx ; mov r24, r16
234-
; CHECK-NEXT: mova r16, #0
235-
; CHECK-NEXT: vextract.s32 r0, x2, r16
236-
; CHECK-NEXT: vextract.s32 r1, x4, r16
237-
; CHECK-NEXT: mova r16, #1
238-
; CHECK-NEXT: vextract.s32 r2, x2, r16
239-
; CHECK-NEXT: vextract.s32 r3, x4, r16
240-
; CHECK-NEXT: mova r16, #2
241-
; CHECK-NEXT: vextract.s32 r4, x2, r16
242-
; CHECK-NEXT: vextract.s32 r5, x4, r16
243-
; CHECK-NEXT: mova r16, #3
244-
; CHECK-NEXT: vextract.s32 r6, x2, r16
245-
; CHECK-NEXT: vextract.s32 r7, x4, r16
246-
; CHECK-NEXT: mova r16, #4
247-
; CHECK-NEXT: vextract.s32 r8, x2, r16
248-
; CHECK-NEXT: vextract.s32 r9, x4, r16
249-
; CHECK-NEXT: mova r16, #5
250-
; CHECK-NEXT: vextract.s32 r10, x2, r16
251-
; CHECK-NEXT: vextract.s32 r11, x4, r16
252-
; CHECK-NEXT: mova r16, #7
253-
; CHECK-NEXT: vextract.s32 r12, x2, r16
254-
; CHECK-NEXT: vextract.s32 r13, x4, r16
255-
; CHECK-NEXT: mova r16, #6
256-
; CHECK-NEXT: vextract.s32 r14, x2, r16
257-
; CHECK-NEXT: vextract.s32 r15, x4, r16
258-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
259-
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
260-
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
261-
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
262-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
263-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
264-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
265-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
266-
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
267-
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
268-
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
269-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
270-
; CHECK-NEXT: ret lr
271-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
272-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
273-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
274-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
275-
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
209+
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
210+
; CHECK-NEXT: nopx // Delay Slot 5
211+
; CHECK-NEXT: nop // Delay Slot 4
212+
; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
213+
; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
214+
; CHECK-NEXT: nop // Delay Slot 1
276215
entry:
277216
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
278217
ret <16 x i32> %shuffle
@@ -282,44 +221,14 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
282221
; CHECK-LABEL: test_set_vector:
283222
; CHECK: .p2align 4
284223
; CHECK-NEXT: // %bb.0: // %entry
285-
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r9, r16
286-
; CHECK-NEXT: mova r16, #0
224+
; CHECK-NEXT: mov r1, r16
287225
; CHECK-NEXT: eqz r0, r0
288-
; CHECK-NEXT: vextract.s32 r1, x2, r16
289-
; CHECK-NEXT: mova r16, #1
290-
; CHECK-NEXT: vextract.s32 r2, x2, r16
291-
; CHECK-NEXT: mova r16, #2
292-
; CHECK-NEXT: vextract.s32 r3, x2, r16
293-
; CHECK-NEXT: mova r16, #3
294-
; CHECK-NEXT: vextract.s32 r4, x2, r16
295-
; CHECK-NEXT: mova r16, #4
296-
; CHECK-NEXT: vextract.s32 r5, x2, r16
297-
; CHECK-NEXT: mova r16, #5
298-
; CHECK-NEXT: vextract.s32 r6, x2, r16
299-
; CHECK-NEXT: mova r16, #7
300-
; CHECK-NEXT: vextract.s32 r7, x2, r16
301-
; CHECK-NEXT: mova r16, #6
302-
; CHECK-NEXT: vextract.s32 r8, x2, r16
303-
; CHECK-NEXT: add r16, r0, #-1
304-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
305-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
306-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
307-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
308-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
309-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
310-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0
311-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
312-
; CHECK-NEXT: vpush.lo.32 x2, r0, x0
313-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
314-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
315-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
316-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
317226
; CHECK-NEXT: ret lr
318-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5
319-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4
320-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 3
321-
; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2
322-
; CHECK-NEXT: mov r16, r9 // Delay Slot 1
227+
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5
228+
; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4
229+
; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3
230+
; CHECK-NEXT: vsel.32 x0, x0, x0, r16 // Delay Slot 2
231+
; CHECK-NEXT: mov r16, r1 // Delay Slot 1
323232
entry:
324233
%cmp = icmp eq i32 %idx, 0
325234
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

0 commit comments

Comments
 (0)