@@ -12,15 +12,13 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212;
1313; CHECK-NODOT-LABEL: udot:
1414; CHECK-NODOT: // %bb.0:
15- ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16- ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17- ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19- ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20- ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21- ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22- ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
15+ ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16+ ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20+ ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
2422; CHECK-NODOT-NEXT: ret
2523 %u.wide = zext <16 x i8 > %u to <16 x i32 >
2624 %s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -37,19 +35,17 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
3735;
3836; CHECK-NODOT-LABEL: udot_narrow:
3937; CHECK-NODOT: // %bb.0:
40- ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
41- ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
38+ ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
4239; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
43- ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
44- ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
45- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
46- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
47- ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
40+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
41+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
42+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
43+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
4844; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
49- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
50- ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
51- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
45+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
5246; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
47+ ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
48+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
5349; CHECK-NODOT-NEXT: ret
5450 %u.wide = zext <8 x i8 > %u to <8 x i32 >
5551 %s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -66,15 +62,13 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
6662;
6763; CHECK-NODOT-LABEL: sdot:
6864; CHECK-NODOT: // %bb.0:
69- ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
70- ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
71- ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
72- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
73- ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
74- ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
75- ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
76- ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
77- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
65+ ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
66+ ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
67+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
68+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
69+ ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
70+ ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
71+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
7872; CHECK-NODOT-NEXT: ret
7973 %u.wide = sext <16 x i8 > %u to <16 x i32 >
8074 %s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -91,19 +85,17 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
9185;
9286; CHECK-NODOT-LABEL: sdot_narrow:
9387; CHECK-NODOT: // %bb.0:
94- ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
95- ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
88+ ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
9689; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
97- ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
98- ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
99- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
100- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
101- ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
90+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
91+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
92+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
93+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
10294; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
103- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
104- ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
105- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
95+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
10696; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
97+ ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
98+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
10799; CHECK-NODOT-NEXT: ret
108100 %u.wide = sext <8 x i8 > %u to <8 x i32 >
109101 %s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -231,27 +223,19 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
231223;
232224; CHECK-NODOT-LABEL: udot_8to64:
233225; CHECK-NODOT: // %bb.0: // %entry
234- ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
235- ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
236- ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
237- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
238- ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
239- ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
226+ ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
227+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
228+ ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
229+ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
240230; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
241- ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
242- ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
243- ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
244- ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
245- ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
246- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
247- ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
248- ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
249- ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
250- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
251- ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
252- ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
253- ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
254- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
231+ ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
232+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
233+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
234+ ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
235+ ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
236+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
237+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
238+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
255239; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
256240; CHECK-NODOT-NEXT: ret
257241entry:
@@ -274,27 +258,19 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
274258;
275259; CHECK-NODOT-LABEL: sdot_8to64:
276260; CHECK-NODOT: // %bb.0: // %entry
277- ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
278- ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
279- ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
280- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
281- ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
282- ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
261+ ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
262+ ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
263+ ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
264+ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
283265; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
284- ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
285- ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
286- ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
287- ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
288- ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
289- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
290- ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
291- ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
292- ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
293- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
294- ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
295- ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
296- ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
297- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
266+ ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
267+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
268+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
269+ ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
270+ ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
271+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
272+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
273+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
298274; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
299275; CHECK-NODOT-NEXT: ret
300276entry:
@@ -555,10 +531,9 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
555531define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
556532; CHECK-LABEL: not_udot:
557533; CHECK: // %bb.0:
558- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
559- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
560- ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
561- ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
534+ ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
535+ ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
536+ ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
562537; CHECK-NEXT: ret
563538 %u.wide = zext <8 x i8 > %u to <8 x i32 >
564539 %s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments