Skip to content

Commit 46aa3c6

Browse files
committed
[DAG] visitVECTOR_SHUFFLE - MergeInnerShuffle - improve shuffle(shuffle(x,y),shuffle(x,y)) merging
MergeInnerShuffle currently attempts to merge shuffle(shuffle(x,y),z) patterns into a single shuffle, using 1 or 2 of the x,y,z ops. However, if we already match 2 ops we might be able to handle the third op if it's also a shuffle that references one of the previous ops, allowing us to handle some cases like: shuffle(shuffle(x,y),shuffle(x,y)); shuffle(shuffle(shuffle(x,z),y),z); shuffle(shuffle(x,shuffle(x,y)),z); etc. This isn't an exhaustive match and is dependent on the order the candidate ops are encountered - if one of the matched ops was a shuffle that was peek-able we don't go back and try to split that; I haven't found much need for that amount of analysis yet. This is a preliminary patch that will allow us to later improve x86 HADD/HSUB matching - but needs to be reviewed separately as it's in generic code and affects existing Thumb2 tests. Differential Revision: https://reviews.llvm.org/D94671
1 parent 17d0fb7 commit 46aa3c6

File tree

3 files changed

+47
-43
lines changed

3 files changed

+47
-43
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20887,6 +20887,32 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
2088720887
continue;
2088820888
}
2088920889

20890+
// Last chance - see if the vector is another shuffle and if it
20891+
// uses one of the existing candidate shuffle ops.
20892+
if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) {
20893+
int InnerIdx = CurrentSVN->getMaskElt(Idx);
20894+
if (InnerIdx < 0) {
20895+
Mask.push_back(-1);
20896+
continue;
20897+
}
20898+
SDValue InnerVec = (InnerIdx < (int)NumElts)
20899+
? CurrentSVN->getOperand(0)
20900+
: CurrentSVN->getOperand(1);
20901+
if (InnerVec.isUndef()) {
20902+
Mask.push_back(-1);
20903+
continue;
20904+
}
20905+
InnerIdx %= NumElts;
20906+
if (InnerVec == SV0) {
20907+
Mask.push_back(InnerIdx);
20908+
continue;
20909+
}
20910+
if (InnerVec == SV1) {
20911+
Mask.push_back(InnerIdx + NumElts);
20912+
continue;
20913+
}
20914+
}
20915+
2089020916
// Bail out if we cannot convert the shuffle pair into a single shuffle.
2089120917
return false;
2089220918
}

llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,37 +10,26 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
1010
; CHECK-NEXT: blt .LBB0_2
1111
; CHECK-NEXT: .LBB0_1: @ %vector.body
1212
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
13-
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
14-
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
13+
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
14+
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
1515
; CHECK-NEXT: subs r3, #4
16-
; CHECK-NEXT: vmullt.s32 q0, q2, q1
17-
; CHECK-NEXT: vmullb.s32 q3, q2, q1
18-
; CHECK-NEXT: vmov r5, s3
19-
; CHECK-NEXT: vmov r12, s2
20-
; CHECK-NEXT: vmov r7, s1
16+
; CHECK-NEXT: vmullb.s32 q2, q1, q0
17+
; CHECK-NEXT: vmullt.s32 q3, q1, q0
18+
; CHECK-NEXT: vmov r5, s11
19+
; CHECK-NEXT: vmov r12, s10
2120
; CHECK-NEXT: lsrl r12, r5, #31
22-
; CHECK-NEXT: vmov r4, s0
23-
; CHECK-NEXT: lsrl r4, r7, #31
24-
; CHECK-NEXT: vmov q0[2], q0[0], r4, r12
25-
; CHECK-NEXT: vmov r12, s14
26-
; CHECK-NEXT: vmov q0[3], q0[1], r7, r5
21+
; CHECK-NEXT: vmov r4, s8
22+
; CHECK-NEXT: vmov r5, s9
23+
; CHECK-NEXT: lsrl r4, r5, #31
24+
; CHECK-NEXT: vmov q2[2], q2[0], r4, r12
2725
; CHECK-NEXT: vmov r5, s15
28-
; CHECK-NEXT: vmov r7, s13
26+
; CHECK-NEXT: vmov r12, s14
2927
; CHECK-NEXT: lsrl r12, r5, #31
3028
; CHECK-NEXT: vmov r4, s12
31-
; CHECK-NEXT: lsrl r4, r7, #31
32-
; CHECK-NEXT: vmov q1[2], q1[0], r4, r12
33-
; CHECK-NEXT: vmov q1[3], q1[1], r7, r5
34-
; CHECK-NEXT: vmov.f32 s8, s6
35-
; CHECK-NEXT: vmov.f32 s9, s7
36-
; CHECK-NEXT: vmov.f32 s6, s0
37-
; CHECK-NEXT: vmov.f32 s7, s1
38-
; CHECK-NEXT: vmov.f32 s10, s2
39-
; CHECK-NEXT: vmov.f32 s5, s6
40-
; CHECK-NEXT: vmov.f32 s11, s3
41-
; CHECK-NEXT: vmov.f32 s6, s8
42-
; CHECK-NEXT: vmov.f32 s7, s10
43-
; CHECK-NEXT: vstrb.8 q1, [r2], #16
29+
; CHECK-NEXT: vmov r5, s13
30+
; CHECK-NEXT: lsrl r4, r5, #31
31+
; CHECK-NEXT: vmov q2[3], q2[1], r4, r12
32+
; CHECK-NEXT: vstrb.8 q2, [r2], #16
4433
; CHECK-NEXT: bne .LBB0_1
4534
; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup
4635
; CHECK-NEXT: pop {r4, r5, r7, pc}

llvm/test/CodeGen/Thumb2/mve-vst2.ll

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,10 @@
66
define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
77
; CHECK-LABEL: vst2_v2i32:
88
; CHECK: @ %bb.0: @ %entry
9-
; CHECK-NEXT: ldrd r12, r3, [r0]
10-
; CHECK-NEXT: ldrd r2, r0, [r0, #8]
11-
; CHECK-NEXT: vmov q0[2], q0[0], r12, r3
12-
; CHECK-NEXT: vmov.f64 d2, d1
13-
; CHECK-NEXT: vmov q2[2], q2[0], r2, r0
14-
; CHECK-NEXT: vmov.f32 s5, s3
15-
; CHECK-NEXT: vmov.f32 s2, s8
16-
; CHECK-NEXT: vmov.f32 s3, s9
17-
; CHECK-NEXT: vmov.f32 s6, s10
18-
; CHECK-NEXT: vmov.f32 s1, s2
19-
; CHECK-NEXT: vmov.f32 s7, s11
20-
; CHECK-NEXT: vmov.f32 s2, s4
21-
; CHECK-NEXT: vmov.f32 s3, s6
9+
; CHECK-NEXT: ldm.w r0, {r2, r3, r12}
10+
; CHECK-NEXT: ldr r0, [r0, #12]
11+
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
12+
; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
2213
; CHECK-NEXT: vstrw.32 q0, [r1]
2314
; CHECK-NEXT: bx lr
2415
entry:
@@ -333,11 +324,9 @@ define void @vst2_v2f32(<2 x float> *%src, <4 x float> *%dst) {
333324
; CHECK-LABEL: vst2_v2f32:
334325
; CHECK: @ %bb.0: @ %entry
335326
; CHECK-NEXT: vldr s0, [r0]
336-
; CHECK-NEXT: vldr s4, [r0, #4]
327+
; CHECK-NEXT: vldr s2, [r0, #4]
337328
; CHECK-NEXT: vldr s1, [r0, #8]
338-
; CHECK-NEXT: vldr s5, [r0, #12]
339-
; CHECK-NEXT: vmov.f32 s2, s4
340-
; CHECK-NEXT: vmov.f32 s3, s5
329+
; CHECK-NEXT: vldr s3, [r0, #12]
341330
; CHECK-NEXT: vstrw.32 q0, [r1]
342331
; CHECK-NEXT: bx lr
343332
entry:

0 commit comments

Comments
 (0)