@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212;
1313; CHECK-NODOT-LABEL: udot:
1414; CHECK-NODOT: // %bb.0:
15- ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16- ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19- ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20- ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
15+ ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16+ ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17+ ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19+ ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20+ ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21+ ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22+ ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
2224; CHECK-NODOT-NEXT: ret
2325 %u.wide = zext <16 x i8 > %u to <16 x i32 >
2426 %s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -35,17 +37,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
3537;
3638; CHECK-NODOT-LABEL: udot_narrow:
3739; CHECK-NODOT: // %bb.0:
38- ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
40+ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
41+ ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
3942; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
40- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
41- ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
42- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
43- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
43+ ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
44+ ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
45+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
46+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
47+ ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
4448; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
45- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
46- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
47- ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
49+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
50+ ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
4851; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
52+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
4953; CHECK-NODOT-NEXT: ret
5054 %u.wide = zext <8 x i8 > %u to <8 x i32 >
5155 %s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -62,13 +66,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
6266;
6367; CHECK-NODOT-LABEL: sdot:
6468; CHECK-NODOT: // %bb.0:
65- ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
66- ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
67- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
68- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
69- ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
70- ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
71- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
69+ ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
70+ ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
71+ ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
72+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
73+ ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
74+ ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
75+ ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
76+ ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
77+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
7278; CHECK-NODOT-NEXT: ret
7379 %u.wide = sext <16 x i8 > %u to <16 x i32 >
7480 %s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -85,17 +91,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
8591;
8692; CHECK-NODOT-LABEL: sdot_narrow:
8793; CHECK-NODOT: // %bb.0:
88- ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
94+ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
95+ ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
8996; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
90- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
91- ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
92- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
93- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
97+ ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
98+ ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
99+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
100+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
101+ ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
94102; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
95- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
96- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
97- ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
103+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
104+ ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
98105; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
106+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
99107; CHECK-NODOT-NEXT: ret
100108 %u.wide = sext <8 x i8 > %u to <8 x i32 >
101109 %s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -223,19 +231,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
223231;
224232; CHECK-NODOT-LABEL: udot_8to64:
225233; CHECK-NODOT: // %bb.0: // %entry
226- ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
227- ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
228- ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
229- ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
234+ ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
235+ ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
236+ ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
237+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
238+ ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
239+ ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
230240; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
231- ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
232- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
233- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
234- ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
235- ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
236- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
237- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
238- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
241+ ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
242+ ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
243+ ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
244+ ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
245+ ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
246+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
247+ ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
248+ ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
249+ ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
250+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
251+ ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
252+ ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
253+ ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
254+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
239255; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
240256; CHECK-NODOT-NEXT: ret
241257entry:
@@ -258,19 +274,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
258274;
259275; CHECK-NODOT-LABEL: sdot_8to64:
260276; CHECK-NODOT: // %bb.0: // %entry
261- ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
262- ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
263- ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
264- ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
277+ ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
278+ ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
279+ ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
280+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
281+ ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
282+ ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
265283; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
266- ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
267- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
268- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
269- ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
270- ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
271- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
272- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
273- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
284+ ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
285+ ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
286+ ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
287+ ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
288+ ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
289+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
290+ ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
291+ ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
292+ ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
293+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
294+ ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
295+ ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
296+ ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
297+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
274298; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
275299; CHECK-NODOT-NEXT: ret
276300entry:
@@ -531,9 +555,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
531555define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
532556; CHECK-LABEL: not_udot:
533557; CHECK: // %bb.0:
534- ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
535- ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
536- ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
558+ ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
559+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
560+ ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
561+ ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
537562; CHECK-NEXT: ret
538563 %u.wide = zext <8 x i8 > %u to <8 x i32 >
539564 %s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments