@@ -108,22 +108,21 @@ define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly
108108;
109109; CHECK-GI-LABEL: matrix_mul_signed:
110110; CHECK-GI: // %bb.0: // %vector.header
111- ; CHECK-GI-NEXT: sxth w9 , w3
111+ ; CHECK-GI-NEXT: sxth w8 , w3
112112; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
113+ ; CHECK-GI-NEXT: dup v0.4s, w8
113114; CHECK-GI-NEXT: sxtw x8, w0
114- ; CHECK-GI-NEXT: dup v0.4s, w9
115115; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
116+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
116117; CHECK-GI-NEXT: .LBB1_1: // %vector.body
117118; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
118119; CHECK-GI-NEXT: add x9, x2, w0, sxtw #1
119120; CHECK-GI-NEXT: subs x8, x8, #8
120121; CHECK-GI-NEXT: ldp d1, d2, [x9]
121122; CHECK-GI-NEXT: add x9, x1, w0, sxtw #2
122123; CHECK-GI-NEXT: add w0, w0, #8
123- ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
124- ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
125- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
126- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
124+ ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
125+ ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v2.4h
127126; CHECK-GI-NEXT: stp q1, q2, [x9]
128127; CHECK-GI-NEXT: b.ne .LBB1_1
129128; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -305,40 +304,39 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
305304; CHECK-GI-NEXT: b.le .LBB3_7
306305; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
307306; CHECK-GI-NEXT: sxth w8, w1
308- ; CHECK-GI-NEXT: mov x9 , xzr
307+ ; CHECK-GI-NEXT: mov x10 , xzr
309308; CHECK-GI-NEXT: cmp w3, #16
310- ; CHECK-GI-NEXT: mov w10 , w3
309+ ; CHECK-GI-NEXT: mov w9 , w3
311310; CHECK-GI-NEXT: b.lo .LBB3_5
312311; CHECK-GI-NEXT: // %bb.2: // %vector.ph
313312; CHECK-GI-NEXT: dup v0.4s, w8
314- ; CHECK-GI-NEXT: and x9, x10 , #0xfffffff0
313+ ; CHECK-GI-NEXT: and x10, x9 , #0xfffffff0
315314; CHECK-GI-NEXT: add x11, x2, #32
316315; CHECK-GI-NEXT: add x12, x0, #16
317- ; CHECK-GI-NEXT: mov x13, x9
316+ ; CHECK-GI-NEXT: mov x13, x10
317+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
318318; CHECK-GI-NEXT: .LBB3_3: // %vector.body
319319; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
320320; CHECK-GI-NEXT: ldp q1, q2, [x12, #-16]
321321; CHECK-GI-NEXT: mov x14, x11
322322; CHECK-GI-NEXT: subs x13, x13, #16
323323; CHECK-GI-NEXT: add x12, x12, #32
324- ; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0
325- ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
326- ; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0
327- ; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
328- ; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
329- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
330- ; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
331- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
332- ; CHECK-GI-NEXT: stp q3, q1, [x14, #-32]!
333- ; CHECK-GI-NEXT: stp q4, q2, [x11], #64
324+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
325+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
326+ ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
327+ ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v2.4h
328+ ; CHECK-GI-NEXT: smull v3.4s, v0.4h, v3.4h
329+ ; CHECK-GI-NEXT: smull v4.4s, v0.4h, v4.4h
330+ ; CHECK-GI-NEXT: stp q1, q3, [x14, #-32]!
331+ ; CHECK-GI-NEXT: stp q2, q4, [x11], #64
334332; CHECK-GI-NEXT: b.ne .LBB3_3
335333; CHECK-GI-NEXT: // %bb.4: // %middle.block
336- ; CHECK-GI-NEXT: cmp x9, x10
334+ ; CHECK-GI-NEXT: cmp x10, x9
337335; CHECK-GI-NEXT: b.eq .LBB3_7
338336; CHECK-GI-NEXT: .LBB3_5: // %for.body.preheader1
339- ; CHECK-GI-NEXT: add x11, x2, x9 , lsl #2
340- ; CHECK-GI-NEXT: add x12, x0, x9 , lsl #1
341- ; CHECK-GI-NEXT: sub x9, x10, x9
337+ ; CHECK-GI-NEXT: add x11, x2, x10 , lsl #2
338+ ; CHECK-GI-NEXT: add x12, x0, x10 , lsl #1
339+ ; CHECK-GI-NEXT: sub x9, x9, x10
342340; CHECK-GI-NEXT: .LBB3_6: // %for.body
343341; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
344342; CHECK-GI-NEXT: ldrsh w10, [x12], #2
@@ -834,30 +832,18 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
834832; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
835833; CHECK-GI-NEXT: mov x8, xzr
836834; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
837- ; CHECK-GI-NEXT: mov x9, v0.d[1]
838- ; CHECK-GI-NEXT: fmov x10, d0
835+ ; CHECK-GI-NEXT: xtn v0.2s, v0.2d
839836; CHECK-GI-NEXT: .LBB7_1: // %loop
840837; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
841- ; CHECK-GI-NEXT: ldr q0 , [x0]
838+ ; CHECK-GI-NEXT: ldr q1 , [x0]
842839; CHECK-GI-NEXT: subs x2, x2, #8
843840; CHECK-GI-NEXT: add x8, x8, #8
844- ; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0
845- ; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
846- ; CHECK-GI-NEXT: fmov x11, d1
847- ; CHECK-GI-NEXT: mov x12, v1.d[1]
848- ; CHECK-GI-NEXT: fmov x13, d0
849- ; CHECK-GI-NEXT: mov x14, v0.d[1]
850- ; CHECK-GI-NEXT: mul x11, x11, x10
851- ; CHECK-GI-NEXT: mul x13, x13, x10
852- ; CHECK-GI-NEXT: mul x12, x12, x9
853- ; CHECK-GI-NEXT: mov v0.d[0], x11
854- ; CHECK-GI-NEXT: mul x11, x14, x9
855- ; CHECK-GI-NEXT: mov v1.d[0], x13
856- ; CHECK-GI-NEXT: mov v0.d[1], x12
857- ; CHECK-GI-NEXT: mov v1.d[1], x11
858- ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
859- ; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #15
860- ; CHECK-GI-NEXT: str q0, [x0], #32
841+ ; CHECK-GI-NEXT: mov d2, v1.d[1]
842+ ; CHECK-GI-NEXT: smull v1.2d, v1.2s, v0.2s
843+ ; CHECK-GI-NEXT: smull v2.2d, v2.2s, v0.2s
844+ ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
845+ ; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
846+ ; CHECK-GI-NEXT: str q1, [x0], #32
861847; CHECK-GI-NEXT: b.ne .LBB7_1
862848; CHECK-GI-NEXT: // %bb.2: // %exit
863849; CHECK-GI-NEXT: ret
@@ -971,18 +957,19 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
971957; CHECK-GI: // %bb.0: // %entry
972958; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
973959; CHECK-GI-NEXT: mov x8, xzr
960+ ; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
961+ ; CHECK-GI-NEXT: xtn v0.8b, v0.8h
974962; CHECK-GI-NEXT: .LBB9_1: // %loop
975963; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
976964; CHECK-GI-NEXT: ldr q1, [x0]
977965; CHECK-GI-NEXT: subs x2, x2, #8
978966; CHECK-GI-NEXT: add x8, x8, #8
979- ; CHECK-GI-NEXT: sshll v2.8h, v1.8b, #0
980- ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
981- ; CHECK-GI-NEXT: mul v2.8h, v2.8h, v0.h[2]
982- ; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[2]
983- ; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15
967+ ; CHECK-GI-NEXT: mov d2, v1.d[1]
968+ ; CHECK-GI-NEXT: smull v1.8h, v1.8b, v0.8b
969+ ; CHECK-GI-NEXT: smull v2.8h, v2.8b, v0.8b
984970; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
985- ; CHECK-GI-NEXT: uzp1 v1.16b, v2.16b, v1.16b
971+ ; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15
972+ ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
986973; CHECK-GI-NEXT: str q1, [x0], #32
987974; CHECK-GI-NEXT: b.ne .LBB9_1
988975; CHECK-GI-NEXT: // %bb.2: // %exit
0 commit comments