@@ -28,17 +28,16 @@ define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonl
2828; CHECK-GI-NEXT: dup v0.4s, w8
2929; CHECK-GI-NEXT: mov w8, w0
3030; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
31+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
3132; CHECK-GI-NEXT: .LBB0_1: // %vector.body
3233; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
3334; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
3435; CHECK-GI-NEXT: subs x8, x8, #8
3536; CHECK-GI-NEXT: ldp d1, d2, [x9]
3637; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
3738; CHECK-GI-NEXT: add w0, w0, #8
38- ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
39- ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
40- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
41- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
39+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
40+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
4241; CHECK-GI-NEXT: stp q1, q2, [x9]
4342; CHECK-GI-NEXT: b.ne .LBB0_1
4443; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -478,22 +477,21 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
478477; CHECK-GI-NEXT: mov x12, x8
479478; CHECK-GI-NEXT: .LBB4_3: // %vector.body
480479; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
481- ; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16]
482480; CHECK-GI-NEXT: and w13, w1, #0xffff
483- ; CHECK-GI-NEXT: dup v2.4s, w13
481+ ; CHECK-GI-NEXT: ldp q1, q2, [x11, #-16]
482+ ; CHECK-GI-NEXT: dup v0.4s, w13
484483; CHECK-GI-NEXT: mov x13, x10
485484; CHECK-GI-NEXT: subs x12, x12, #16
486485; CHECK-GI-NEXT: add x11, x11, #32
487- ; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
488- ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
489- ; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0
490- ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
491- ; CHECK-GI-NEXT: mul v3.4s, v2.4s, v3.4s
492- ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
493- ; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s
494- ; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s
495- ; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]!
496- ; CHECK-GI-NEXT: stp q4, q1, [x10], #64
486+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
487+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
488+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
489+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
490+ ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h
491+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
492+ ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v4.4h
493+ ; CHECK-GI-NEXT: stp q1, q3, [x13, #-32]!
494+ ; CHECK-GI-NEXT: stp q2, q0, [x10], #64
497495; CHECK-GI-NEXT: b.ne .LBB4_3
498496; CHECK-GI-NEXT: // %bb.4: // %middle.block
499497; CHECK-GI-NEXT: cmp x8, x9
@@ -775,22 +773,15 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
775773; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
776774; CHECK-GI-NEXT: mov x8, xzr
777775; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
778- ; CHECK-GI-NEXT: mov x9, v0.d[1]
779- ; CHECK-GI-NEXT: fmov x10, d0
776+ ; CHECK-GI-NEXT: xtn v0.2s, v0.2d
780777; CHECK-GI-NEXT: .LBB6_1: // %loop
781778; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
782- ; CHECK-GI-NEXT: ldr d0, [x0]
779+ ; CHECK-GI-NEXT: ldr d1, [x0]
783780; CHECK-GI-NEXT: subs x2, x2, #8
784781; CHECK-GI-NEXT: add x8, x8, #8
785- ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
786- ; CHECK-GI-NEXT: fmov x11, d0
787- ; CHECK-GI-NEXT: mov x12, v0.d[1]
788- ; CHECK-GI-NEXT: mul x11, x11, x10
789- ; CHECK-GI-NEXT: mul x12, x12, x9
790- ; CHECK-GI-NEXT: mov v0.d[0], x11
791- ; CHECK-GI-NEXT: mov v0.d[1], x12
792- ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
793- ; CHECK-GI-NEXT: str d0, [x0], #32
782+ ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
783+ ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
784+ ; CHECK-GI-NEXT: str d1, [x0], #32
794785; CHECK-GI-NEXT: b.ne .LBB6_1
795786; CHECK-GI-NEXT: // %bb.2: // %exit
796787; CHECK-GI-NEXT: ret
@@ -917,13 +908,14 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
917908; CHECK-GI: // %bb.0: // %entry
918909; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
919910; CHECK-GI-NEXT: mov x8, xzr
911+ ; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
912+ ; CHECK-GI-NEXT: xtn v0.8b, v0.8h
920913; CHECK-GI-NEXT: .LBB8_1: // %loop
921914; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
922915; CHECK-GI-NEXT: ldr d1, [x0]
923916; CHECK-GI-NEXT: subs x2, x2, #8
924917; CHECK-GI-NEXT: add x8, x8, #8
925- ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
926- ; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[0]
918+ ; CHECK-GI-NEXT: umull v1.8h, v1.8b, v0.8b
927919; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
928920; CHECK-GI-NEXT: xtn v1.8b, v1.8h
929921; CHECK-GI-NEXT: str d1, [x0], #32
@@ -1046,17 +1038,16 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
10461038; CHECK-GI-NEXT: dup v0.4s, w8
10471039; CHECK-GI-NEXT: mov w8, w0
10481040; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
1041+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
10491042; CHECK-GI-NEXT: .LBB10_1: // %vector.body
10501043; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
10511044; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
10521045; CHECK-GI-NEXT: subs x8, x8, #8
10531046; CHECK-GI-NEXT: ldp d1, d2, [x9]
10541047; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
10551048; CHECK-GI-NEXT: add w0, w0, #8
1056- ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
1057- ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
1058- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
1059- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
1049+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
1050+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
10601051; CHECK-GI-NEXT: stp q1, q2, [x9]
10611052; CHECK-GI-NEXT: b.ne .LBB10_1
10621053; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -1135,6 +1126,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
11351126; CHECK-GI-NEXT: dup v0.4s, w8
11361127; CHECK-GI-NEXT: mov w8, w0
11371128; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
1129+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
11381130; CHECK-GI-NEXT: .LBB11_1: // %vector.body
11391131; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
11401132; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
@@ -1143,16 +1135,14 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
11431135; CHECK-GI-NEXT: ldur q2, [x9, #8]
11441136; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
11451137; CHECK-GI-NEXT: add w0, w0, #16
1146- ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
1147- ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
1148- ; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
1149- ; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
1150- ; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
1151- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
1152- ; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
1153- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
1154- ; CHECK-GI-NEXT: stp q3, q1, [x9]
1155- ; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
1138+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
1139+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
1140+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
1141+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
1142+ ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h
1143+ ; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
1144+ ; CHECK-GI-NEXT: stp q1, q3, [x9]
1145+ ; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
11561146; CHECK-GI-NEXT: b.ne .LBB11_1
11571147; CHECK-GI-NEXT: // %bb.2: // %for.end12
11581148; CHECK-GI-NEXT: ret
0 commit comments