Skip to content

Commit aceabde

Browse files
[AIE2] Add a pattern that combines the first halves of two vectors
1 parent ccc47bc commit aceabde

File tree

3 files changed

+108
-89
lines changed

3 files changed

+108
-89
lines changed

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,35 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
414414
MI.eraseFromParent();
415415
return true;
416416
}
417+
418+
// {0, 1, ..., N/2-1, N, N+1, ..., N+N/2-1} (N = DstNumElts) -> G_UNMERGE_VALUES
419+
// Take the first halves of the two vectors and concatenate them into one
420+
// vector.
421+
std::function<std::optional<int32_t>()> FirstEightA =
422+
adderGenerator(0, (DstNumElts / 2) - 1, 1);
423+
std::function<std::optional<int32_t>()> FirstEightB =
424+
adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);
425+
426+
std::function<std::optional<int32_t>()> FirstAndThird =
427+
concatGenerators(SmallVector<std::function<std::optional<int32_t>()>>{
428+
FirstEightA, FirstEightB});
429+
if (matchCombineShuffleVectorSimple(MI, FirstAndThird,
430+
(DstNumElts / 2) - 1)) {
431+
if (DstNumElts <= 2)
432+
return false;
433+
const Register DstReg = MI.getOperand(0).getReg();
434+
const LLT HalfSrcTy =
435+
LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
436+
const Register HalfOfA =
437+
createUnmergeValue(MI, MI.getOperand(1).getReg(),
438+
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
439+
const Register HalfOfB =
440+
createUnmergeValue(MI, MI.getOperand(2).getReg(),
441+
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
442+
Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
443+
MI.eraseFromParent();
444+
return true;
445+
}
417446
return false;
418447
}
419448

llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,3 +264,66 @@ body: |
264264
%1:_(<128 x s8>) = COPY $y2
265265
%2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7)
266266
PseudoRET implicit $lr, implicit %2
267+
...
268+
269+
---
270+
name: insert_vector_16_elements
271+
legalized: false
272+
body: |
273+
bb.1.entry:
274+
liveins: $x0, $x1
275+
; CHECK-LABEL: name: insert_vector_16_elements
276+
; CHECK: liveins: $x0, $x1
277+
; CHECK-NEXT: {{ $}}
278+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
279+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
280+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
281+
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>)
282+
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>)
283+
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>)
284+
%1:_(<16 x s32>) = COPY $x0
285+
%2:_(<16 x s32>) = COPY $x1
286+
%3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
287+
PseudoRET implicit $lr, implicit %3
288+
...
289+
290+
---
291+
name: insert_vector_8_elements
292+
legalized: false
293+
body: |
294+
bb.1.entry:
295+
liveins: $wl0, $wl1
296+
; CHECK-LABEL: name: insert_vector_8_elements
297+
; CHECK: liveins: $wl0, $wl1
298+
; CHECK-NEXT: {{ $}}
299+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0
300+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1
301+
; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>)
302+
; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>)
303+
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>)
304+
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>)
305+
%1:_(<8 x s32>) = COPY $wl0
306+
%2:_(<8 x s32>) = COPY $wl1
307+
%3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11)
308+
PseudoRET implicit $lr, implicit %3
309+
...
310+
311+
---
312+
name: insert_vector_128_elements
313+
legalized: false
314+
body: |
315+
bb.1.entry:
316+
liveins: $y2, $y3
317+
; CHECK-LABEL: name: insert_vector_128_elements
318+
; CHECK: liveins: $y2, $y3
319+
; CHECK-NEXT: {{ $}}
320+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2
321+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3
322+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>)
323+
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>)
324+
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>)
325+
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>)
326+
%1:_(<128 x s8>) = COPY $y2
327+
%2:_(<128 x s8>) = COPY $y3
328+
%3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191)
329+
PseudoRET implicit $lr, implicit %3

llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll

Lines changed: 16 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -48,100 +48,27 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
4848
; CHECK-LABEL: test_insert_vector:
4949
; CHECK: .p2align 4
5050
; CHECK-NEXT: // %bb.0: // %entry
51-
; CHECK-NEXT: nopa ; nopb ; jz r0, #.LBB1_2
52-
; CHECK-NEXT: mov r24, r16 // Delay Slot 5
53-
; CHECK-NEXT: mov r25, r17 // Delay Slot 4
54-
; CHECK-NEXT: mov r26, r18 // Delay Slot 3
55-
; CHECK-NEXT: mov r27, r19 // Delay Slot 2
56-
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
51+
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
52+
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
53+
; CHECK-NEXT: nop // Delay Slot 4
54+
; CHECK-NEXT: nop // Delay Slot 3
55+
; CHECK-NEXT: nop // Delay Slot 2
56+
; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
5757
; CHECK-NEXT: // %bb.1: // %if.end
58-
; CHECK-NEXT: nopx ; vextract.s32 r0, x2, r16
59-
; CHECK-NEXT: nop
60-
; CHECK-NEXT: mova r16, #1
61-
; CHECK-NEXT: mova r17, #5
62-
; CHECK-NEXT: mova r19, #6
63-
; CHECK-NEXT: mova r18, #0
64-
; CHECK-NEXT: vextract.s32 r1, x2, r16
65-
; CHECK-NEXT: vextract.s32 r8, x4, r18
66-
; CHECK-NEXT: movx r16, #2
67-
; CHECK-NEXT: mova r18, #1
68-
; CHECK-NEXT: vextract.s32 r5, x2, r17
69-
; CHECK-NEXT: vextract.s32 r6, x2, r19
70-
; CHECK-NEXT: vextract.s32 r13, x4, r17
71-
; CHECK-NEXT: vextract.s32 r15, x4, r19
72-
; CHECK-NEXT: vextract.s32 r2, x2, r16
73-
; CHECK-NEXT: vextract.s32 r9, x4, r18
74-
; CHECK-NEXT: movx r16, #3
75-
; CHECK-NEXT: mova r18, #2
76-
; CHECK-NEXT: vextract.s32 r10, x4, r18
77-
; CHECK-NEXT: vextract.s32 r3, x2, r16
78-
; CHECK-NEXT: nop
79-
; CHECK-NEXT: mova r16, #4
80-
; CHECK-NEXT: vextract.s32 r4, x2, r16
81-
; CHECK-NEXT: movx r18, #3
82-
; CHECK-NEXT: mova r16, #7
83-
; CHECK-NEXT: vextract.s32 r11, x4, r18
84-
; CHECK-NEXT: j #.LBB1_3
85-
; CHECK-NEXT: mova r18, #4 // Delay Slot 5
86-
; CHECK-NEXT: vextract.s32 r7, x2, r16 // Delay Slot 4
87-
; CHECK-NEXT: vextract.s32 r12, x4, r18 // Delay Slot 3
88-
; CHECK-NEXT: vextract.s32 r14, x4, r16 // Delay Slot 2
58+
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
59+
; CHECK-NEXT: nopx // Delay Slot 5
60+
; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
61+
; CHECK-NEXT: nop // Delay Slot 3
62+
; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
8963
; CHECK-NEXT: nop // Delay Slot 1
9064
; CHECK-NEXT: .p2align 4
9165
; CHECK-NEXT: .LBB1_2: // %if.then
92-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv
93-
; CHECK-NEXT: nop
94-
; CHECK-NEXT: mova r16, #1
95-
; CHECK-NEXT: mova r17, #5
96-
; CHECK-NEXT: mova r19, #6
97-
; CHECK-NEXT: mova r18, #0
98-
; CHECK-NEXT: vextract.s32 r1, x4, r16
99-
; CHECK-NEXT: vextract.s32 r8, x2, r18
100-
; CHECK-NEXT: movx r16, #2
101-
; CHECK-NEXT: mova r18, #1
102-
; CHECK-NEXT: vextract.s32 r5, x4, r17
103-
; CHECK-NEXT: vextract.s32 r6, x4, r19
104-
; CHECK-NEXT: vextract.s32 r13, x2, r17
105-
; CHECK-NEXT: vextract.s32 r15, x2, r19
106-
; CHECK-NEXT: vextract.s32 r2, x4, r16
107-
; CHECK-NEXT: vextract.s32 r9, x2, r18
108-
; CHECK-NEXT: movx r16, #3
109-
; CHECK-NEXT: mova r18, #2
110-
; CHECK-NEXT: vextract.s32 r3, x4, r16
111-
; CHECK-NEXT: vextract.s32 r10, x2, r18
112-
; CHECK-NEXT: movx r16, #4
113-
; CHECK-NEXT: mova r18, #3
114-
; CHECK-NEXT: vextract.s32 r4, x4, r16
115-
; CHECK-NEXT: vextract.s32 r11, x2, r18
116-
; CHECK-NEXT: movx r16, #7
117-
; CHECK-NEXT: mova r18, #4
118-
; CHECK-NEXT: vextract.s32 r7, x4, r16
119-
; CHECK-NEXT: vextract.s32 r12, x2, r18
120-
; CHECK-NEXT: vextract.s32 r14, x2, r16
121-
; CHECK-NEXT: nop
122-
; CHECK-NEXT: .p2align 4
123-
; CHECK-NEXT: .LBB1_3: // %cleanup
124-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
125-
; CHECK-NEXT: mov r18, r26
126-
; CHECK-NEXT: mov r17, r25
127-
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
128-
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
129-
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
130-
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
131-
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
132-
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
133-
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
134-
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
135-
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
136-
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
137-
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
138-
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
13966
; CHECK-NEXT: ret lr
140-
; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5
141-
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4
142-
; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3
143-
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
144-
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
67+
; CHECK-NEXT: nop // Delay Slot 5
68+
; CHECK-NEXT: nop // Delay Slot 4
69+
; CHECK-NEXT: nop // Delay Slot 3
70+
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
71+
; CHECK-NEXT: nop // Delay Slot 1
14572
entry:
14673
%shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
14774
%cmp = icmp eq i32 %idx, 0

0 commit comments

Comments
 (0)