@@ -12,15 +12,13 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212;
1313; CHECK-NODOT-LABEL: udot:
1414; CHECK-NODOT: // %bb.0:
15- ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16- ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17- ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19- ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20- ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21- ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22- ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
15+ ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16+ ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20+ ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
2422; CHECK-NODOT-NEXT: ret
2523 %u.wide = zext <16 x i8 > %u to <16 x i32 >
2624 %s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -52,20 +50,18 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
5250; CHECK-NODOT-NEXT: mov x8, xzr
5351; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
5452; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
55- ; CHECK-NODOT-NEXT: ldr q0, [x1, x8]
56- ; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
53+ ; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
54+ ; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
5755; CHECK-NODOT-NEXT: add x8, x8, #16
5856; CHECK-NODOT-NEXT: cmp x8, #16
59- ; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
60- ; CHECK-NODOT-NEXT: ushll2 v4.8h, v2.16b, #0
61- ; CHECK-NODOT-NEXT: ushll v5.8h, v0.8b, #0
62- ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
57+ ; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
58+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
6359; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
64- ; CHECK-NODOT-NEXT: umull v6.4s, v4.4h, v3.4h
65- ; CHECK-NODOT-NEXT: umlal v1.4s, v2.4h, v5.4h
66- ; CHECK-NODOT-NEXT: umlal2 v6.4s, v2.8h, v5.8h
67- ; CHECK-NODOT-NEXT: umlal2 v1.4s, v4.8h, v3.8h
68- ; CHECK-NODOT-NEXT: add v1.4s, v6.4s, v1.4s
60+ ; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
61+ ; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
62+ ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
63+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
64+ ; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
6965; CHECK-NODOT-NEXT: b.ne .LBB1_1
7066; CHECK-NODOT-NEXT: // %bb.2: // %end
7167; CHECK-NODOT-NEXT: ret
@@ -99,19 +95,17 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
9995;
10096; CHECK-NODOT-LABEL: udot_narrow:
10197; CHECK-NODOT: // %bb.0:
102- ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
103- ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
98+ ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
10499; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
105- ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
106- ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
107- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
108- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
109- ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
100+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
101+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
102+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
103+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
110104; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
111- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
112- ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
113- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
105+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
114106; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
107+ ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
108+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
115109; CHECK-NODOT-NEXT: ret
116110 %u.wide = zext <8 x i8 > %u to <8 x i32 >
117111 %s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -128,15 +122,13 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
128122;
129123; CHECK-NODOT-LABEL: sdot:
130124; CHECK-NODOT: // %bb.0:
131- ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
132- ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
133- ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
134- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
135- ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
136- ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
137- ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
138- ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
139- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
125+ ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
126+ ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
127+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
128+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
129+ ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
130+ ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
131+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
140132; CHECK-NODOT-NEXT: ret
141133 %u.wide = sext <16 x i8 > %u to <16 x i32 >
142134 %s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -153,19 +145,17 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
153145;
154146; CHECK-NODOT-LABEL: sdot_narrow:
155147; CHECK-NODOT: // %bb.0:
156- ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
157- ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
148+ ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
158149; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
159- ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
160- ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
161- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
162- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
163- ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
150+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
151+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
152+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
153+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
164154; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
165- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
166- ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
167- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
155+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
168156; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
157+ ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
158+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
169159; CHECK-NODOT-NEXT: ret
170160 %u.wide = sext <8 x i8 > %u to <8 x i32 >
171161 %s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -417,27 +407,19 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
417407;
418408; CHECK-NODOT-LABEL: udot_8to64:
419409; CHECK-NODOT: // %bb.0: // %entry
420- ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
421- ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
422- ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
423- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
424- ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
425- ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
410+ ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
411+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
412+ ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
413+ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
426414; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
427- ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
428- ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
429- ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
430- ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
431- ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
432- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
433- ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
434- ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
435- ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
436- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
437- ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
438- ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
439- ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
440- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
415+ ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
416+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
417+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
418+ ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
419+ ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
420+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
421+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
422+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
441423; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
442424; CHECK-NODOT-NEXT: ret
443425entry:
@@ -460,27 +442,19 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
460442;
461443; CHECK-NODOT-LABEL: sdot_8to64:
462444; CHECK-NODOT: // %bb.0: // %entry
463- ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
464- ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
465- ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
466- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
467- ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
468- ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
445+ ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
446+ ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
447+ ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
448+ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
469449; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
470- ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
471- ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
472- ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
473- ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
474- ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
475- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
476- ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
477- ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
478- ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
479- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
480- ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
481- ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
482- ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
483- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
450+ ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
451+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
452+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
453+ ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
454+ ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
455+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
456+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
457+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
484458; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
485459; CHECK-NODOT-NEXT: ret
486460entry:
@@ -797,10 +771,9 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
797771define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
798772; CHECK-LABEL: not_udot:
799773; CHECK: // %bb.0:
800- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
801- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
802- ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
803- ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
774+ ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
775+ ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
776+ ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
804777; CHECK-NEXT: ret
805778 %u.wide = zext <8 x i8 > %u to <8 x i32 >
806779 %s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments