Skip to content

Commit 039e713

Browse files
committed
Avoid the vector SExt/ZExt when indexed operations apply and all elements of the vector are used
1 parent 6bb2f90 commit 039e713

File tree

2 files changed

+193
-0
lines changed

2 files changed

+193
-0
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18602,6 +18602,33 @@ static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
1860218602
SeenZExtOrSExt = true;
1860318603
}
1860418604

18605+
// Avoid the aforementioned use of vector SExt/ZExt when every element of the
18606+
// extended vector is consumed by a shuffle whose mask repeats a single index,
18607+
// so that the indexed variants of operations (e.g. MLA, MUL) can be used.
18608+
EVT ExtendType = Extend->getValueType(0);
18609+
if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
18610+
const int NumElements = ExtendType.getVectorNumElements();
18611+
SmallBitVector UsedElements(NumElements, false);
18612+
for (auto UI = Extend.getNode()->use_begin(),
18613+
UE = Extend.getNode()->use_end();
18614+
UI != UE; ++UI) {
18615+
SDNode *User = UI->getUser();
18616+
if (User->getOpcode() == ISD::VECTOR_SHUFFLE &&
18617+
User->getOperand(0) == Extend) {
18618+
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
18619+
const int Idx = Mask[0];
18620+
if (Idx >= NumElements)
18621+
continue;
18622+
if (llvm::all_of(Mask, [Idx](int M) { return M == Idx; }))
18623+
UsedElements.set(Idx);
18624+
else
18625+
break; // early loop exit to help performance
18626+
}
18627+
}
18628+
if (UsedElements.all())
18629+
return SDValue();
18630+
}
18631+
1860518632
SDValue NBV;
1860618633
SDLoc DL(BV);
1860718634
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
3+
; RUN: llc < %s -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK-GI
4+
define <4 x i32> @ext_shuffle_v4i16_v4i32(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
5+
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32:
6+
; CHECK-SD: // %bb.0:
7+
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
8+
; CHECK-SD-NEXT: mov v3.16b, v2.16b
9+
; CHECK-SD-NEXT: mov v4.16b, v2.16b
10+
; CHECK-SD-NEXT: mov v5.16b, v2.16b
11+
; CHECK-SD-NEXT: mla v3.4s, v1.4s, v0.s[0]
12+
; CHECK-SD-NEXT: mla v4.4s, v1.4s, v0.s[1]
13+
; CHECK-SD-NEXT: mla v2.4s, v1.4s, v0.s[3]
14+
; CHECK-SD-NEXT: mla v5.4s, v1.4s, v0.s[2]
15+
; CHECK-SD-NEXT: sub v0.4s, v3.4s, v4.4s
16+
; CHECK-SD-NEXT: sub v1.4s, v2.4s, v5.4s
17+
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
18+
; CHECK-SD-NEXT: ret
19+
;
20+
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32:
21+
; CHECK-GI: // %bb.0:
22+
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
23+
; CHECK-GI-NEXT: mov v3.16b, v2.16b
24+
; CHECK-GI-NEXT: mov v4.16b, v2.16b
25+
; CHECK-GI-NEXT: mov v5.16b, v2.16b
26+
; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
27+
; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
28+
; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[3]
29+
; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.s[2]
30+
; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
31+
; CHECK-GI-NEXT: sub v1.4s, v5.4s, v2.4s
32+
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
33+
; CHECK-GI-NEXT: ret
34+
%lanes = sext <4 x i16> %l to <4 x i32>
35+
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
36+
%mul0 = mul <4 x i32> %shf0, %a
37+
%add0 = add <4 x i32> %mul0, %b
38+
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
39+
%mul1 = mul <4 x i32> %shf1, %a
40+
%add1 = add <4 x i32> %mul1, %b
41+
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
42+
%mul2 = mul <4 x i32> %shf2, %a
43+
%add2 = add <4 x i32> %mul2, %b
44+
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
45+
%mul3 = mul <4 x i32> %shf3, %a
46+
%add3 = add <4 x i32> %mul3, %b
47+
%sub1 = sub <4 x i32> %add0, %add1
48+
%sub2 = sub <4 x i32> %add2, %add3
49+
%sub3 = sub <4 x i32> %sub1, %sub2
50+
ret <4 x i32> %sub3
51+
}
52+
53+
define <4 x i32> @ext_shuffle_v4i16_v4i32_partial(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
54+
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_partial:
55+
; CHECK-SD: // %bb.0:
56+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
57+
; CHECK-SD-NEXT: dup v3.4h, v0.h[0]
58+
; CHECK-SD-NEXT: dup v4.4h, v0.h[1]
59+
; CHECK-SD-NEXT: mov v5.16b, v2.16b
60+
; CHECK-SD-NEXT: dup v0.4h, v0.h[2]
61+
; CHECK-SD-NEXT: mov v6.16b, v2.16b
62+
; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
63+
; CHECK-SD-NEXT: sshll v4.4s, v4.4h, #0
64+
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
65+
; CHECK-SD-NEXT: mla v5.4s, v3.4s, v1.4s
66+
; CHECK-SD-NEXT: mla v6.4s, v4.4s, v1.4s
67+
; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
68+
; CHECK-SD-NEXT: sub v0.4s, v5.4s, v6.4s
69+
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
70+
; CHECK-SD-NEXT: ret
71+
;
72+
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_partial:
73+
; CHECK-GI: // %bb.0:
74+
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
75+
; CHECK-GI-NEXT: mov v3.16b, v2.16b
76+
; CHECK-GI-NEXT: mov v4.16b, v2.16b
77+
; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
78+
; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
79+
; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[2]
80+
; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
81+
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v2.4s
82+
; CHECK-GI-NEXT: ret
83+
%lanes = sext <4 x i16> %l to <4 x i32>
84+
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
85+
%mul0 = mul <4 x i32> %shf0, %a
86+
%add0 = add <4 x i32> %mul0, %b
87+
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
88+
%mul1 = mul <4 x i32> %shf1, %a
89+
%add1 = add <4 x i32> %mul1, %b
90+
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
91+
%mul2 = mul <4 x i32> %shf2, %a
92+
%add2 = add <4 x i32> %mul2, %b
93+
%sub1 = sub <4 x i32> %add0, %add1
94+
%sub3 = sub <4 x i32> %sub1, %add2
95+
ret <4 x i32> %sub3
96+
}
97+
98+
define <4 x i32> @ext_shuffle_v4i16_v4i32_add(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
99+
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_add:
100+
; CHECK-SD: // %bb.0:
101+
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
102+
; CHECK-SD-NEXT: dup v1.4s, v0.s[0]
103+
; CHECK-SD-NEXT: dup v3.4s, v0.s[1]
104+
; CHECK-SD-NEXT: dup v4.4s, v0.s[2]
105+
; CHECK-SD-NEXT: dup v0.4s, v0.s[3]
106+
; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
107+
; CHECK-SD-NEXT: add v3.4s, v3.4s, v2.4s
108+
; CHECK-SD-NEXT: add v4.4s, v4.4s, v2.4s
109+
; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
110+
; CHECK-SD-NEXT: sub v1.4s, v1.4s, v3.4s
111+
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v4.4s
112+
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
113+
; CHECK-SD-NEXT: ret
114+
;
115+
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_add:
116+
; CHECK-GI: // %bb.0:
117+
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
118+
; CHECK-GI-NEXT: dup v1.4s, v0.s[0]
119+
; CHECK-GI-NEXT: dup v3.4s, v0.s[1]
120+
; CHECK-GI-NEXT: dup v4.4s, v0.s[2]
121+
; CHECK-GI-NEXT: dup v0.4s, v0.s[3]
122+
; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
123+
; CHECK-GI-NEXT: add v3.4s, v3.4s, v2.4s
124+
; CHECK-GI-NEXT: add v4.4s, v4.4s, v2.4s
125+
; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s
126+
; CHECK-GI-NEXT: sub v1.4s, v1.4s, v3.4s
127+
; CHECK-GI-NEXT: sub v0.4s, v4.4s, v0.4s
128+
; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
129+
; CHECK-GI-NEXT: ret
130+
%lanes = sext <4 x i16> %l to <4 x i32>
131+
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
132+
%add0 = add <4 x i32> %shf0, %b
133+
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
134+
%add1 = add <4 x i32> %shf1, %b
135+
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
136+
%add2 = add <4 x i32> %shf2, %b
137+
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
138+
%add3 = add <4 x i32> %shf3, %b
139+
%sub1 = sub <4 x i32> %add0, %add1
140+
%sub2 = sub <4 x i32> %add2, %add3
141+
%sub3 = sub <4 x i32> %sub1, %sub2
142+
ret <4 x i32> %sub3
143+
}
144+
145+
define <4 x i32> @ext_shuffle_v4i16_v4i32_one(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
146+
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_one:
147+
; CHECK-SD: // %bb.0:
148+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
149+
; CHECK-SD-NEXT: dup v0.4h, v0.h[3]
150+
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
151+
; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
152+
; CHECK-SD-NEXT: mov v0.16b, v2.16b
153+
; CHECK-SD-NEXT: ret
154+
;
155+
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_one:
156+
; CHECK-GI: // %bb.0:
157+
; CHECK-GI-NEXT: sshll v3.4s, v0.4h, #0
158+
; CHECK-GI-NEXT: mov v0.16b, v2.16b
159+
; CHECK-GI-NEXT: mla v0.4s, v1.4s, v3.s[3]
160+
; CHECK-GI-NEXT: ret
161+
%lanes = sext <4 x i16> %l to <4 x i32>
162+
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
163+
%mul3 = mul <4 x i32> %shf3, %a
164+
%add3 = add <4 x i32> %mul3, %b
165+
ret <4 x i32> %add3
166+
}

0 commit comments

Comments
 (0)