@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212;
1313; CHECK-NODOT-LABEL: udot:
1414; CHECK-NODOT: // %bb.0:
15- ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16- ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19- ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20- ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
15+ ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16+ ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17+ ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19+ ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20+ ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21+ ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22+ ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
2224; CHECK-NODOT-NEXT: ret
2325 %u.wide = zext <16 x i8 > %u to <16 x i32 >
2426 %s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -95,17 +97,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
9597;
9698; CHECK-NODOT-LABEL: udot_narrow:
9799; CHECK-NODOT: // %bb.0:
98- ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
100+ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
101+ ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
99102; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
100- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
101- ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
102- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
103- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
103+ ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
104+ ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
105+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
106+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
107+ ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
104108; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
105- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
106- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
107- ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
109+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
110+ ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
108111; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
112+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
109113; CHECK-NODOT-NEXT: ret
110114 %u.wide = zext <8 x i8 > %u to <8 x i32 >
111115 %s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -122,13 +126,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
122126;
123127; CHECK-NODOT-LABEL: sdot:
124128; CHECK-NODOT: // %bb.0:
125- ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
126- ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
127- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
128- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
129- ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
130- ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
131- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
129+ ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
130+ ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
131+ ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
132+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
133+ ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
134+ ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
135+ ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
136+ ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
137+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
132138; CHECK-NODOT-NEXT: ret
133139 %u.wide = sext <16 x i8 > %u to <16 x i32 >
134140 %s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -145,17 +151,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
145151;
146152; CHECK-NODOT-LABEL: sdot_narrow:
147153; CHECK-NODOT: // %bb.0:
148- ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
154+ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
155+ ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
149156; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
150- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
151- ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
152- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
153- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
157+ ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
158+ ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
159+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
160+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
161+ ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
154162; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
155- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
156- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
157- ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
163+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
164+ ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
158165; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
166+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
159167; CHECK-NODOT-NEXT: ret
160168 %u.wide = sext <8 x i8 > %u to <8 x i32 >
161169 %s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -407,19 +415,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
407415;
408416; CHECK-NODOT-LABEL: udot_8to64:
409417; CHECK-NODOT: // %bb.0: // %entry
410- ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
411- ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
412- ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
413- ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
418+ ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
419+ ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
420+ ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
421+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
422+ ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
423+ ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
414424; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
415- ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
416- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
417- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
418- ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
419- ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
420- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
421- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
422- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
425+ ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
426+ ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
427+ ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
428+ ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
429+ ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
430+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
431+ ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
432+ ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
433+ ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
434+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
435+ ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
436+ ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
437+ ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
438+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
423439; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
424440; CHECK-NODOT-NEXT: ret
425441entry:
@@ -442,19 +458,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
442458;
443459; CHECK-NODOT-LABEL: sdot_8to64:
444460; CHECK-NODOT: // %bb.0: // %entry
445- ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
446- ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
447- ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
448- ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
461+ ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
462+ ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
463+ ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
464+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
465+ ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
466+ ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
449467; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
450- ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
451- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
452- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
453- ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
454- ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
455- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
456- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
457- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
468+ ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
469+ ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
470+ ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
471+ ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
472+ ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
473+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
474+ ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
475+ ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
476+ ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
477+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
478+ ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
479+ ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
480+ ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
481+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
458482; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
459483; CHECK-NODOT-NEXT: ret
460484entry:
@@ -771,9 +795,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
771795define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
772796; CHECK-LABEL: not_udot:
773797; CHECK: // %bb.0:
774- ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
775- ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
776- ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
798+ ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
799+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
800+ ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
801+ ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
777802; CHECK-NEXT: ret
778803 %u.wide = zext <8 x i8 > %u to <8 x i32 >
779804 %s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments