Skip to content

Commit 2ae63a8

Browse files
[AIE2] Enable G_CONCAT_VECTOR optimizations for AIE2
1 parent: 895c081 · commit: 2ae63a8

File tree

2 files changed: 58 additions (+58) and 155 deletions (-155).

llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
2323
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
2424
#include "llvm/CodeGen/MachineDominators.h"
25+
#include "llvm/CodeGen/TargetOpcodes.h"
2526
#include "llvm/IR/IntrinsicsAIE2.h"
2627
#include "llvm/InitializePasses.h"
2728

@@ -167,6 +168,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
167168
case TargetOpcode::G_INTRINSIC: {
168169
return tryToCombineIntrinsic(MI);
169170
}
171+
case TargetOpcode::G_SHUFFLE_VECTOR: {
172+
return Helper.tryCombineShuffleVector(MI);
173+
}
170174
default:
171175
break;
172176
}

llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll

Lines changed: 54 additions & 155 deletions
Original file line number | Diff line number | Diff line change
@@ -99,100 +99,79 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
9999
; CHECK-LABEL: test_insert_vector:
100100
; CHECK: .p2align 4
101101
; CHECK-NEXT: // %bb.0: // %entry
102-
; CHECK-NEXT: nopx ; mov r25, r17
103-
; CHECK-NEXT: mov r26, r18
104-
; CHECK-NEXT: mov r27, r19
105-
; CHECK-NEXT: mova r19, #0
106-
; CHECK-NEXT: mova r18, #1
107-
; CHECK-NEXT: mov r24, r16
108-
; CHECK-NEXT: mova r16, #3
109-
; CHECK-NEXT: vextract.s32 r4, x4, r16
110-
; CHECK-NEXT: movx r17, #2
111-
; CHECK-NEXT: mova r16, #4
112-
; CHECK-NEXT: vextract.s32 r1, x4, r19
113-
; CHECK-NEXT: vextract.s32 r2, x4, r18
114-
; CHECK-NEXT: vextract.s32 r3, x4, r17
115-
; CHECK-NEXT: vextract.s32 r5, x4, r16
102+
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
103+
; CHECK-NEXT: nopx // Delay Slot 5
104+
; CHECK-NEXT: nop // Delay Slot 4
105+
; CHECK-NEXT: nop // Delay Slot 3
106+
; CHECK-NEXT: mov r24, r16 // Delay Slot 2
107+
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
108+
; CHECK-NEXT: // %bb.1: // %if.end
109+
; CHECK-NEXT: vextract.s32 r0, x2, r16
110+
; CHECK-NEXT: vextract.s32 r1, x4, r16
116111
; CHECK-NEXT: nop
117-
; CHECK-NEXT: mova r16, #5
118-
; CHECK-NEXT: vextract.s32 r6, x4, r16
112+
; CHECK-NEXT: mova r16, #1
113+
; CHECK-NEXT: vextract.s32 r2, x2, r16
114+
; CHECK-NEXT: vextract.s32 r3, x4, r16
119115
; CHECK-NEXT: nop
120-
; CHECK-NEXT: mova r16, #7
121-
; CHECK-NEXT: vextract.s32 r7, x4, r16
116+
; CHECK-NEXT: mova r16, #2
117+
; CHECK-NEXT: vextract.s32 r4, x2, r16
118+
; CHECK-NEXT: vextract.s32 r5, x4, r16
122119
; CHECK-NEXT: nop
123-
; CHECK-NEXT: mova r16, #6
124-
; CHECK-NEXT: vextract.s32 r8, x4, r16
125-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
126-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
127-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
128-
; CHECK-NEXT: jz r0, #.LBB1_2
129-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0 // Delay Slot 5
130-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
131-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 3
132-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
133-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1
134-
; CHECK-NEXT: // %bb.1: // %if.end
135-
; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv
136-
; CHECK-NEXT: vextract.s32 r0, x2, r19
137-
; CHECK-NEXT: vextract.s32 r1, x0, r19
138-
; CHECK-NEXT: vextract.s32 r2, x2, r18
139-
; CHECK-NEXT: vextract.s32 r3, x0, r18
140-
; CHECK-NEXT: vextract.s32 r4, x2, r17
141-
; CHECK-NEXT: vextract.s32 r5, x0, r17
120+
; CHECK-NEXT: mova r16, #3
142121
; CHECK-NEXT: vextract.s32 r6, x2, r16
143-
; CHECK-NEXT: vextract.s32 r7, x0, r16
122+
; CHECK-NEXT: vextract.s32 r7, x4, r16
144123
; CHECK-NEXT: nop
145124
; CHECK-NEXT: mova r16, #4
146125
; CHECK-NEXT: vextract.s32 r8, x2, r16
147-
; CHECK-NEXT: vextract.s32 r9, x0, r16
126+
; CHECK-NEXT: vextract.s32 r9, x4, r16
148127
; CHECK-NEXT: nop
149128
; CHECK-NEXT: mova r16, #5
150129
; CHECK-NEXT: vextract.s32 r10, x2, r16
151-
; CHECK-NEXT: vextract.s32 r11, x0, r16
130+
; CHECK-NEXT: vextract.s32 r11, x4, r16
152131
; CHECK-NEXT: nop
153132
; CHECK-NEXT: mova r16, #7
154133
; CHECK-NEXT: vextract.s32 r12, x2, r16
155-
; CHECK-NEXT: vextract.s32 r13, x0, r16
156134
; CHECK-NEXT: j #.LBB1_3
157-
; CHECK-NEXT: nop // Delay Slot 5
158-
; CHECK-NEXT: mova r16, #6 // Delay Slot 4
159-
; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 3
160-
; CHECK-NEXT: vextract.s32 r15, x0, r16 // Delay Slot 2
161-
; CHECK-NEXT: nop // Delay Slot 1
135+
; CHECK-NEXT: vextract.s32 r13, x4, r16 // Delay Slot 5
136+
; CHECK-NEXT: nop // Delay Slot 4
137+
; CHECK-NEXT: mova r16, #6 // Delay Slot 3
138+
; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 2
139+
; CHECK-NEXT: vextract.s32 r15, x4, r16 // Delay Slot 1
162140
; CHECK-NEXT: .p2align 4
163141
; CHECK-NEXT: .LBB1_2: // %if.then
164-
; CHECK-NEXT: mova r16, #3; nopx
165-
; CHECK-NEXT: vextract.s32 r0, x0, r19
166-
; CHECK-NEXT: vextract.s32 r1, x2, r19
167-
; CHECK-NEXT: vextract.s32 r2, x0, r18
168-
; CHECK-NEXT: vextract.s32 r3, x2, r18
169-
; CHECK-NEXT: vextract.s32 r4, x0, r17
170-
; CHECK-NEXT: vextract.s32 r5, x2, r17
171-
; CHECK-NEXT: vextract.s32 r6, x0, r16
142+
; CHECK-NEXT: nopa ; nopx ; vextract.s32 r0, x4, r16
143+
; CHECK-NEXT: vextract.s32 r1, x2, r16
144+
; CHECK-NEXT: nop
145+
; CHECK-NEXT: mova r16, #1
146+
; CHECK-NEXT: vextract.s32 r2, x4, r16
147+
; CHECK-NEXT: vextract.s32 r3, x2, r16
148+
; CHECK-NEXT: nop
149+
; CHECK-NEXT: mova r16, #2
150+
; CHECK-NEXT: vextract.s32 r4, x4, r16
151+
; CHECK-NEXT: vextract.s32 r5, x2, r16
152+
; CHECK-NEXT: nop
153+
; CHECK-NEXT: mova r16, #3
154+
; CHECK-NEXT: vextract.s32 r6, x4, r16
172155
; CHECK-NEXT: vextract.s32 r7, x2, r16
173156
; CHECK-NEXT: nop
174157
; CHECK-NEXT: mova r16, #4
175-
; CHECK-NEXT: vextract.s32 r8, x0, r16
158+
; CHECK-NEXT: vextract.s32 r8, x4, r16
176159
; CHECK-NEXT: vextract.s32 r9, x2, r16
177160
; CHECK-NEXT: nop
178161
; CHECK-NEXT: mova r16, #5
179-
; CHECK-NEXT: vextract.s32 r10, x0, r16
162+
; CHECK-NEXT: vextract.s32 r10, x4, r16
180163
; CHECK-NEXT: vextract.s32 r11, x2, r16
181164
; CHECK-NEXT: nop
182165
; CHECK-NEXT: mova r16, #7
183-
; CHECK-NEXT: vextract.s32 r12, x0, r16
166+
; CHECK-NEXT: vextract.s32 r12, x4, r16
184167
; CHECK-NEXT: vextract.s32 r13, x2, r16
185168
; CHECK-NEXT: nop
186169
; CHECK-NEXT: mova r16, #6
187-
; CHECK-NEXT: vextract.s32 r14, x0, r16
170+
; CHECK-NEXT: vextract.s32 r14, x4, r16
188171
; CHECK-NEXT: vextract.s32 r15, x2, r16
189-
; CHECK-NEXT: nop
190172
; CHECK-NEXT: .p2align 4
191173
; CHECK-NEXT: .LBB1_3: // %cleanup
192-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
193-
; CHECK-NEXT: mov r18, r26
194-
; CHECK-NEXT: mov r17, r25
195-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
174+
; CHECK-NEXT: nopa ; nopb ; nopx ; vpush.lo.32 x0, r13, x0
196175
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
197176
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
198177
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
@@ -232,56 +211,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b
232211
; CHECK-LABEL: test_concat_vector:
233212
; CHECK: .p2align 4
234213
; CHECK-NEXT: // %bb.0: // %entry
235-
; CHECK-NEXT: nopa ; nopx ; mov r24, r16
236-
; CHECK-NEXT: mova r16, #0
237-
; CHECK-NEXT: vextract.s32 r0, x2, r16
238-
; CHECK-NEXT: vextract.s32 r1, x4, r16
239-
; CHECK-NEXT: nop
240-
; CHECK-NEXT: mova r16, #1
241-
; CHECK-NEXT: vextract.s32 r2, x2, r16
242-
; CHECK-NEXT: vextract.s32 r3, x4, r16
243-
; CHECK-NEXT: nop
244-
; CHECK-NEXT: mova r16, #2
245-
; CHECK-NEXT: vextract.s32 r4, x2, r16
246-
; CHECK-NEXT: vextract.s32 r5, x4, r16
247-
; CHECK-NEXT: nop
248-
; CHECK-NEXT: mova r16, #3
249-
; CHECK-NEXT: vextract.s32 r6, x2, r16
250-
; CHECK-NEXT: vextract.s32 r7, x4, r16
251-
; CHECK-NEXT: nop
252-
; CHECK-NEXT: mova r16, #4
253-
; CHECK-NEXT: vextract.s32 r8, x2, r16
254-
; CHECK-NEXT: vextract.s32 r9, x4, r16
255-
; CHECK-NEXT: nop
256-
; CHECK-NEXT: mova r16, #5
257-
; CHECK-NEXT: vextract.s32 r10, x2, r16
258-
; CHECK-NEXT: vextract.s32 r11, x4, r16
259-
; CHECK-NEXT: nop
260-
; CHECK-NEXT: mova r16, #7
261-
; CHECK-NEXT: vextract.s32 r12, x2, r16
262-
; CHECK-NEXT: vextract.s32 r13, x4, r16
263-
; CHECK-NEXT: nop
264-
; CHECK-NEXT: mova r16, #6
265-
; CHECK-NEXT: vextract.s32 r14, x2, r16
266-
; CHECK-NEXT: vextract.s32 r15, x4, r16
267-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
268-
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
269-
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
270-
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
271-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
272-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
273-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
274-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
275-
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
276-
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
277-
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
278-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
279-
; CHECK-NEXT: ret lr
280-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
281-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
282-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
283-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
284-
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
214+
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
215+
; CHECK-NEXT: nopx // Delay Slot 5
216+
; CHECK-NEXT: nop // Delay Slot 4
217+
; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
218+
; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
219+
; CHECK-NEXT: nop // Delay Slot 1
285220
entry:
286221
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
287222
ret <16 x i32> %shuffle
@@ -291,50 +226,14 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
291226
; CHECK-LABEL: test_set_vector:
292227
; CHECK: .p2align 4
293228
; CHECK-NEXT: // %bb.0: // %entry
294-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r9, r16; nopv
295-
; CHECK-NEXT: mova r16, #0
296-
; CHECK-NEXT: vextract.s32 r1, x2, r16
297-
; CHECK-NEXT: nop
298-
; CHECK-NEXT: mova r16, #1
299-
; CHECK-NEXT: vextract.s32 r2, x2, r16
300-
; CHECK-NEXT: nop
301-
; CHECK-NEXT: mova r16, #2
302-
; CHECK-NEXT: vextract.s32 r3, x2, r16
303-
; CHECK-NEXT: nop
304-
; CHECK-NEXT: mova r16, #3
305-
; CHECK-NEXT: vextract.s32 r4, x2, r16
306-
; CHECK-NEXT: nop
307-
; CHECK-NEXT: mova r16, #4
308-
; CHECK-NEXT: vextract.s32 r5, x2, r16
309-
; CHECK-NEXT: nop
310-
; CHECK-NEXT: mova r16, #5
311-
; CHECK-NEXT: vextract.s32 r6, x2, r16
312-
; CHECK-NEXT: nop
313-
; CHECK-NEXT: mova r16, #7
314-
; CHECK-NEXT: vextract.s32 r7, x2, r16
229+
; CHECK-NEXT: mov r1, r16
315230
; CHECK-NEXT: eqz r0, r0
316-
; CHECK-NEXT: mova r16, #6
317-
; CHECK-NEXT: vextract.s32 r8, x2, r16
318-
; CHECK-NEXT: add r16, r0, #-1
319-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
320-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
321-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
322-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
323-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
324-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
325-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0
326-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
327-
; CHECK-NEXT: vpush.lo.32 x2, r0, x0
328-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
329-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
330-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
331-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
332231
; CHECK-NEXT: ret lr
333-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5
334-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4
335-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 3
336-
; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2
337-
; CHECK-NEXT: mov r16, r9 // Delay Slot 1
232+
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5
233+
; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4
234+
; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3
235+
; CHECK-NEXT: vsel.32 x0, x0, x0, r16 // Delay Slot 2
236+
; CHECK-NEXT: mov r16, r1 // Delay Slot 1
338237
entry:
339238
%cmp = icmp eq i32 %idx, 0
340239
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

0 commit comments

Comments
 (0)