Skip to content

Commit 52f054f

Browse files
[AIE2] Enable G_CONCAT_VECTOR optimizations for AIE2
1 parent 3e692b4 commit 52f054f

File tree

3 files changed

+88
-186
lines changed

3 files changed

+88
-186
lines changed

llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
2323
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
2424
#include "llvm/CodeGen/MachineDominators.h"
25+
#include "llvm/CodeGen/TargetOpcodes.h"
2526
#include "llvm/InitializePasses.h"
2627

2728
#define DEBUG_TYPE "aie2-prelegalizer-combiner"
@@ -54,8 +55,8 @@ class AIE2PreLegalizerCombinerImpl : public Combiner {
5455
const LegalizerInfo *LI);
5556

5657
static const char *getName() { return "AIE2PreLegalizerCombiner"; }
57-
5858
bool tryCombineAll(MachineInstr &I) const override;
59+
bool tryCombineAllImpl(MachineInstr &I) const;
5960

6061
private:
6162
#define GET_GICOMBINER_CLASS_MEMBERS
@@ -83,6 +84,18 @@ AIE2PreLegalizerCombinerImpl::AIE2PreLegalizerCombinerImpl(
8384
{
8485
}
8586

87+
// Attempts to combine MI. tryCombineAllImpl is tried first (this commit sets
// that name as CombineAllMethodName for AIE2PreLegalizerCombiner in
// AIECombine.td); only when it reports no change does the opcode switch below
// run. Returns true if either step reports a change, false otherwise.
bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
88+
if (tryCombineAllImpl(MI))
89+
return true;
90+
91+
unsigned Opc = MI.getOpcode();
92+
switch (Opc) {
93+
// G_SHUFFLE_VECTOR is the only opcode handled by this switch; the work is
// delegated to Helper.tryCombineShuffleVector and its result is returned.
case TargetOpcode::G_SHUFFLE_VECTOR:
94+
return Helper.tryCombineShuffleVector(MI);
95+
}
96+
// Every other opcode: nothing was combined.
return false;
97+
}
98+
8699
class AIE2PreLegalizerCombiner : public MachineFunctionPass {
87100
public:
88101
static char ID;

llvm/lib/Target/AIE/AIECombine.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ def AIE2PreLegalizerCombiner
6363
combine_globalval_offset,
6464
combine_extract_vector_elt_and_zsa_ext,
6565
combine_splat_vector ]> {
66+
let CombineAllMethodName = "tryCombineAllImpl";
6667
}
6768

6869
def AIE2PostLegalizerGenericCombiner

llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll

Lines changed: 73 additions & 185 deletions
Original file line number | Diff line number | Diff line change
@@ -99,100 +99,79 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
9999
; CHECK-LABEL: test_insert_vector:
100100
; CHECK: .p2align 4
101101
; CHECK-NEXT: // %bb.0: // %entry
102-
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r25, r17
103-
; CHECK-NEXT: mov r26, r18
104-
; CHECK-NEXT: mov r27, r19
105-
; CHECK-NEXT: mova r19, #0
106-
; CHECK-NEXT: mova r18, #1
107-
; CHECK-NEXT: mova r17, #2
108-
; CHECK-NEXT: mov r24, r16
102+
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
103+
; CHECK-NEXT: nopx // Delay Slot 5
104+
; CHECK-NEXT: nop // Delay Slot 4
105+
; CHECK-NEXT: nop // Delay Slot 3
106+
; CHECK-NEXT: mov r24, r16 // Delay Slot 2
107+
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
108+
; CHECK-NEXT: // %bb.1: // %if.end
109+
; CHECK-NEXT: vextract.s32 r0, x2, r16
110+
; CHECK-NEXT: vextract.s32 r1, x4, r16
111+
; CHECK-NEXT: nop
112+
; CHECK-NEXT: mova r16, #1
113+
; CHECK-NEXT: vextract.s32 r2, x2, r16
114+
; CHECK-NEXT: vextract.s32 r3, x4, r16
115+
; CHECK-NEXT: nop
116+
; CHECK-NEXT: mova r16, #2
117+
; CHECK-NEXT: vextract.s32 r4, x2, r16
118+
; CHECK-NEXT: vextract.s32 r5, x4, r16
119+
; CHECK-NEXT: nop
109120
; CHECK-NEXT: mova r16, #3
110-
; CHECK-NEXT: vextract.s32 r4, x4, r16
111-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
121+
; CHECK-NEXT: vextract.s32 r6, x2, r16
122+
; CHECK-NEXT: vextract.s32 r7, x4, r16
123+
; CHECK-NEXT: nop
112124
; CHECK-NEXT: mova r16, #4
113-
; CHECK-NEXT: vextract.s32 r1, x4, r19
114-
; CHECK-NEXT: vextract.s32 r2, x4, r18
115-
; CHECK-NEXT: vextract.s32 r3, x4, r17
116-
; CHECK-NEXT: vextract.s32 r5, x4, r16
117-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
125+
; CHECK-NEXT: vextract.s32 r8, x2, r16
126+
; CHECK-NEXT: vextract.s32 r9, x4, r16
127+
; CHECK-NEXT: nop
118128
; CHECK-NEXT: mova r16, #5
119-
; CHECK-NEXT: vextract.s32 r6, x4, r16
120-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
121-
; CHECK-NEXT: mova r16, #6
122-
; CHECK-NEXT: vextract.s32 r7, x4, r16
123-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
124-
; CHECK-NEXT: mova r16, #7
125-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
126-
; CHECK-NEXT: vextract.s32 r8, x4, r16
127-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
128-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
129-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
130-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
131-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
132-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
133-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
134-
; CHECK-NEXT: jz r0, #.LBB1_2
135-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 5
136-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 4
137-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
138-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 2
139-
; CHECK-NEXT: nop // Delay Slot 1
140-
; CHECK-NEXT: // %bb.1: // %if.end
141-
; CHECK-NEXT: nopx ; vextract.s32 r12, x2, r16
142-
; CHECK-NEXT: vextract.s32 r13, x0, r16
143-
; CHECK-NEXT: vextract.s32 r4, x2, r17
144-
; CHECK-NEXT: vextract.s32 r5, x0, r17
129+
; CHECK-NEXT: vextract.s32 r10, x2, r16
130+
; CHECK-NEXT: vextract.s32 r11, x4, r16
145131
; CHECK-NEXT: nop
146-
; CHECK-NEXT: mova r17, #3
147-
; CHECK-NEXT: vextract.s32 r0, x2, r19
148-
; CHECK-NEXT: vextract.s32 r1, x0, r19
149-
; CHECK-NEXT: vextract.s32 r2, x2, r18
150-
; CHECK-NEXT: vextract.s32 r3, x0, r18
151-
; CHECK-NEXT: vextract.s32 r6, x2, r17
152-
; CHECK-NEXT: vextract.s32 r7, x0, r17
153-
; CHECK-NEXT: movx r16, #6
154-
; CHECK-NEXT: mova r17, #4
155-
; CHECK-NEXT: vextract.s32 r14, x2, r16
156-
; CHECK-NEXT: vextract.s32 r15, x0, r16
157-
; CHECK-NEXT: vextract.s32 r8, x2, r17
158-
; CHECK-NEXT: vextract.s32 r9, x0, r17
132+
; CHECK-NEXT: mova r16, #7
133+
; CHECK-NEXT: vextract.s32 r12, x2, r16
159134
; CHECK-NEXT: j #.LBB1_3
160-
; CHECK-NEXT: nop // Delay Slot 5
161-
; CHECK-NEXT: mova r17, #5 // Delay Slot 4
162-
; CHECK-NEXT: vextract.s32 r10, x2, r17 // Delay Slot 3
163-
; CHECK-NEXT: vextract.s32 r11, x0, r17 // Delay Slot 2
164-
; CHECK-NEXT: nop // Delay Slot 1
135+
; CHECK-NEXT: vextract.s32 r13, x4, r16 // Delay Slot 5
136+
; CHECK-NEXT: nop // Delay Slot 4
137+
; CHECK-NEXT: mova r16, #6 // Delay Slot 3
138+
; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 2
139+
; CHECK-NEXT: vextract.s32 r15, x4, r16 // Delay Slot 1
165140
; CHECK-NEXT: .p2align 4
166141
; CHECK-NEXT: .LBB1_2: // %if.then
167-
; CHECK-NEXT: nopa ; nopb ; nopx ; vextract.s32 r12, x0, r16; nops
168-
; CHECK-NEXT: vextract.s32 r13, x2, r16
169-
; CHECK-NEXT: vextract.s32 r4, x0, r17
170-
; CHECK-NEXT: vextract.s32 r5, x2, r17
142+
; CHECK-NEXT: nopa ; nopx ; vextract.s32 r0, x4, r16
143+
; CHECK-NEXT: vextract.s32 r1, x2, r16
171144
; CHECK-NEXT: nop
172-
; CHECK-NEXT: mova r17, #3
173-
; CHECK-NEXT: vextract.s32 r0, x0, r19
174-
; CHECK-NEXT: vextract.s32 r1, x2, r19
175-
; CHECK-NEXT: vextract.s32 r2, x0, r18
176-
; CHECK-NEXT: vextract.s32 r3, x2, r18
177-
; CHECK-NEXT: vextract.s32 r6, x0, r17
178-
; CHECK-NEXT: vextract.s32 r7, x2, r17
179-
; CHECK-NEXT: movx r16, #6
180-
; CHECK-NEXT: mova r17, #4
181-
; CHECK-NEXT: vextract.s32 r14, x0, r16
182-
; CHECK-NEXT: vextract.s32 r15, x2, r16
183-
; CHECK-NEXT: vextract.s32 r8, x0, r17
184-
; CHECK-NEXT: vextract.s32 r9, x2, r17
145+
; CHECK-NEXT: mova r16, #1
146+
; CHECK-NEXT: vextract.s32 r2, x4, r16
147+
; CHECK-NEXT: vextract.s32 r3, x2, r16
185148
; CHECK-NEXT: nop
186-
; CHECK-NEXT: mova r17, #5
187-
; CHECK-NEXT: vextract.s32 r10, x0, r17
188-
; CHECK-NEXT: vextract.s32 r11, x2, r17
149+
; CHECK-NEXT: mova r16, #2
150+
; CHECK-NEXT: vextract.s32 r4, x4, r16
151+
; CHECK-NEXT: vextract.s32 r5, x2, r16
189152
; CHECK-NEXT: nop
153+
; CHECK-NEXT: mova r16, #3
154+
; CHECK-NEXT: vextract.s32 r6, x4, r16
155+
; CHECK-NEXT: vextract.s32 r7, x2, r16
156+
; CHECK-NEXT: nop
157+
; CHECK-NEXT: mova r16, #4
158+
; CHECK-NEXT: vextract.s32 r8, x4, r16
159+
; CHECK-NEXT: vextract.s32 r9, x2, r16
160+
; CHECK-NEXT: nop
161+
; CHECK-NEXT: mova r16, #5
162+
; CHECK-NEXT: vextract.s32 r10, x4, r16
163+
; CHECK-NEXT: vextract.s32 r11, x2, r16
164+
; CHECK-NEXT: nop
165+
; CHECK-NEXT: mova r16, #7
166+
; CHECK-NEXT: vextract.s32 r12, x4, r16
167+
; CHECK-NEXT: vextract.s32 r13, x2, r16
168+
; CHECK-NEXT: nop
169+
; CHECK-NEXT: mova r16, #6
170+
; CHECK-NEXT: vextract.s32 r14, x4, r16
171+
; CHECK-NEXT: vextract.s32 r15, x2, r16
190172
; CHECK-NEXT: .p2align 4
191173
; CHECK-NEXT: .LBB1_3: // %cleanup
192-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
193-
; CHECK-NEXT: mov r18, r26
194-
; CHECK-NEXT: mov r17, r25
195-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
174+
; CHECK-NEXT: nopa ; nopb ; nopx ; vpush.lo.32 x0, r13, x0
196175
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
197176
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
198177
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
@@ -232,56 +211,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b
232211
; CHECK-LABEL: test_concat_vector:
233212
; CHECK: .p2align 4
234213
; CHECK-NEXT: // %bb.0: // %entry
235-
; CHECK-NEXT: nopa ; nopx ; mov r24, r16
236-
; CHECK-NEXT: mova r16, #0
237-
; CHECK-NEXT: vextract.s32 r0, x2, r16
238-
; CHECK-NEXT: vextract.s32 r1, x4, r16
239-
; CHECK-NEXT: nop
240-
; CHECK-NEXT: mova r16, #1
241-
; CHECK-NEXT: vextract.s32 r2, x2, r16
242-
; CHECK-NEXT: vextract.s32 r3, x4, r16
243-
; CHECK-NEXT: nop
244-
; CHECK-NEXT: mova r16, #2
245-
; CHECK-NEXT: vextract.s32 r4, x2, r16
246-
; CHECK-NEXT: vextract.s32 r5, x4, r16
247-
; CHECK-NEXT: nop
248-
; CHECK-NEXT: mova r16, #3
249-
; CHECK-NEXT: vextract.s32 r6, x2, r16
250-
; CHECK-NEXT: vextract.s32 r7, x4, r16
251-
; CHECK-NEXT: nop
252-
; CHECK-NEXT: mova r16, #4
253-
; CHECK-NEXT: vextract.s32 r8, x2, r16
254-
; CHECK-NEXT: vextract.s32 r9, x4, r16
255-
; CHECK-NEXT: nop
256-
; CHECK-NEXT: mova r16, #5
257-
; CHECK-NEXT: vextract.s32 r10, x2, r16
258-
; CHECK-NEXT: vextract.s32 r11, x4, r16
259-
; CHECK-NEXT: nop
260-
; CHECK-NEXT: mova r16, #7
261-
; CHECK-NEXT: vextract.s32 r12, x2, r16
262-
; CHECK-NEXT: vextract.s32 r13, x4, r16
263-
; CHECK-NEXT: nop
264-
; CHECK-NEXT: mova r16, #6
265-
; CHECK-NEXT: vextract.s32 r14, x2, r16
266-
; CHECK-NEXT: vextract.s32 r15, x4, r16
267-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
268-
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
269-
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
270-
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
271-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
272-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
273-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
274-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
275-
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
276-
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
277-
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
278-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
279-
; CHECK-NEXT: ret lr
280-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
281-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
282-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
283-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
284-
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
214+
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
215+
; CHECK-NEXT: nopx // Delay Slot 5
216+
; CHECK-NEXT: nop // Delay Slot 4
217+
; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
218+
; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
219+
; CHECK-NEXT: nop // Delay Slot 1
285220
entry:
286221
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
287222
ret <16 x i32> %shuffle
@@ -291,60 +226,13 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
291226
; CHECK-LABEL: test_set_vector:
292227
; CHECK: .p2align 4
293228
; CHECK-NEXT: // %bb.0: // %entry
294-
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r9, r16
295-
; CHECK-NEXT: mova r16, #0
296-
; CHECK-NEXT: vextract.s32 r1, x2, r16
297-
; CHECK-NEXT: eqz r0, r0
298-
; CHECK-NEXT: mova r16, #1
299-
; CHECK-NEXT: vextract.s32 r2, x2, r16
300-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
301-
; CHECK-NEXT: mova r16, #2
302-
; CHECK-NEXT: vextract.s32 r3, x2, r16
303-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
304-
; CHECK-NEXT: mova r16, #3
305-
; CHECK-NEXT: vextract.s32 r4, x2, r16
306-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
307-
; CHECK-NEXT: mova r16, #4
308-
; CHECK-NEXT: vextract.s32 r5, x2, r16
309-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
310-
; CHECK-NEXT: mova r16, #5
311-
; CHECK-NEXT: vextract.s32 r6, x2, r16
312-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
313-
; CHECK-NEXT: mova r16, #6
314-
; CHECK-NEXT: vextract.s32 r7, x2, r16
315-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
316-
; CHECK-NEXT: mova r16, #7
317-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
318-
; CHECK-NEXT: vextract.s32 r8, x2, r16
319-
; CHECK-NEXT: add r16, r0, #-1
320-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0
321-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
322-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
323-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
324-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
325-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
326-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
327-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0
328-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
329-
; CHECK-NEXT: vpush.lo.32 x2, r8, x0
330-
; CHECK-NEXT: vpush.lo.32 x2, r7, x2
331-
; CHECK-NEXT: vpush.lo.32 x2, r6, x2
332-
; CHECK-NEXT: vpush.lo.32 x2, r5, x2
333-
; CHECK-NEXT: vpush.lo.32 x2, r4, x2
334-
; CHECK-NEXT: vpush.lo.32 x2, r3, x2
335-
; CHECK-NEXT: vpush.lo.32 x2, r2, x2
336-
; CHECK-NEXT: vpush.lo.32 x2, r1, x2
337-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
338-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
339-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
340-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
341-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2
229+
; CHECK-NEXT: nopx ; mov r1, r16
342230
; CHECK-NEXT: ret lr
343-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5
344-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4
345-
; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 3
346-
; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2
347-
; CHECK-NEXT: mov r16, r9 // Delay Slot 1
231+
; CHECK-NEXT: eqz r0, r0 // Delay Slot 5
232+
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 4
233+
; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3
234+
; CHECK-NEXT: vsel.32 x0, x2, x0, r16 // Delay Slot 2
235+
; CHECK-NEXT: mov r16, r1 // Delay Slot 1
348236
entry:
349237
%cmp = icmp eq i32 %idx, 0
350238
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

0 commit comments

Comments
 (0)