@@ -891,70 +891,73 @@ entry:
; usdot_multiple_zext_users(%p1, %p2, %p3) -> <4 x i32>
; Mixed-sign dot-product test: %load1 and %load2 are sign-extended, %load3 is
; zero-extended, and the single zext result feeds two multiplies, each consumed
; by its own partial.reduce.add call. The two partial sums are added in %end.
; NOTE(review): this text looks like a diff rendering (fused old/new line
; numbers, then `-` for removed and `+` for added lines) rather than a plain
; .ll file -- confirm before applying any edit mechanically.
891891define <4 x i32 > @usdot_multiple_zext_users (ptr %p1 , ptr %p2 , ptr %p3 ) {
892892; CHECK-LABEL: usdot_multiple_zext_users:
893893; CHECK: // %bb.0: // %entry
; In the removed CHECK lines all tbl/sshll/smull/smlal2 work sits before the
; .LBB28_1 label and the loop body is only subs/add/b.ne. In the added CHECK
; lines the three loads and the multiply-accumulate sequence sit inside the loop.
894- ; CHECK-NEXT: adrp x9, .LCPI28_0
895- ; CHECK-NEXT: adrp x10, .LCPI28_3
896- ; CHECK-NEXT: ldr q0, [x2]
897- ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI28_0]
898- ; CHECK-NEXT: adrp x9, .LCPI28_1
899- ; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI28_3]
900- ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI28_1]
894+ ; CHECK-NEXT: adrp x8, .LCPI28_0
895+ ; CHECK-NEXT: movi v0.2d, #0000000000000000
896+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
897+ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0]
898+ ; CHECK-NEXT: adrp x8, .LCPI28_1
901899; CHECK-NEXT: adrp x9, .LCPI28_2
902- ; CHECK-NEXT: ldr q5, [x1]
903- ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI28_2]
904- ; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b
900+ ; CHECK-NEXT: adrp x10, .LCPI28_3
901+ ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI28_1]
902+ ; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI28_2]
903+ ; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI28_3]
905904; CHECK-NEXT: mov x8, xzr
906- ; CHECK-NEXT: tbl v2.16b, { v0.16b }, v2.16b
907- ; CHECK-NEXT: mov w9, #1024 // =0x400
908- ; CHECK-NEXT: tbl v3.16b, { v0.16b }, v3.16b
909- ; CHECK-NEXT: tbl v0.16b, { v0.16b }, v4.16b
910- ; CHECK-NEXT: ldr q4, [x0]
911- ; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
912- ; CHECK-NEXT: sshll v2.8h, v4.8b, #0
913- ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
914- ; CHECK-NEXT: sshll2 v3.8h, v4.16b, #0
915- ; CHECK-NEXT: sshll v4.8h, v5.8b, #0
916- ; CHECK-NEXT: sshll2 v5.8h, v5.16b, #0
917- ; CHECK-NEXT: smull v6.4s, v2.4h, v1.4h
918- ; CHECK-NEXT: smull v17.4s, v4.4h, v1.4h
919- ; CHECK-NEXT: smull v7.4s, v3.4h, v0.4h
920- ; CHECK-NEXT: smull v16.4s, v5.4h, v0.4h
921- ; CHECK-NEXT: smlal2 v6.4s, v3.8h, v0.8h
922- ; CHECK-NEXT: smlal2 v17.4s, v5.8h, v0.8h
923- ; CHECK-NEXT: smlal2 v7.4s, v2.8h, v1.8h
924- ; CHECK-NEXT: smlal2 v16.4s, v4.8h, v1.8h
925- ; CHECK-NEXT: add v0.4s, v7.4s, v6.4s
926- ; CHECK-NEXT: add v1.4s, v16.4s, v17.4s
927905; CHECK-NEXT: .LBB28_1: // %vector.body
928906; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
929- ; CHECK-NEXT: subs x9, x9, #16
907+ ; CHECK-NEXT: ldr q6, [x2, x8]
908+ ; CHECK-NEXT: ldr q18, [x0, x8]
909+ ; CHECK-NEXT: ldr q19, [x1, x8]
930910; CHECK-NEXT: add x8, x8, #16
911+ ; CHECK-NEXT: tbl v7.16b, { v6.16b }, v1.16b
912+ ; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
913+ ; CHECK-NEXT: tbl v17.16b, { v6.16b }, v4.16b
914+ ; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
915+ ; CHECK-NEXT: cmp x8, #1024
916+ ; CHECK-NEXT: uzp1 v7.8h, v16.8h, v7.8h
917+ ; CHECK-NEXT: sshll v16.8h, v18.8b, #0
918+ ; CHECK-NEXT: uzp1 v6.8h, v6.8h, v17.8h
919+ ; CHECK-NEXT: sshll2 v17.8h, v18.16b, #0
920+ ; CHECK-NEXT: sshll v18.8h, v19.8b, #0
921+ ; CHECK-NEXT: sshll2 v19.8h, v19.16b, #0
922+ ; CHECK-NEXT: smlal v0.4s, v16.4h, v7.4h
923+ ; CHECK-NEXT: smlal v2.4s, v18.4h, v7.4h
924+ ; CHECK-NEXT: smull v20.4s, v17.4h, v6.4h
925+ ; CHECK-NEXT: smull v21.4s, v19.4h, v6.4h
926+ ; CHECK-NEXT: smlal2 v0.4s, v17.8h, v6.8h
927+ ; CHECK-NEXT: smlal2 v2.4s, v19.8h, v6.8h
928+ ; CHECK-NEXT: smlal2 v20.4s, v16.8h, v7.8h
929+ ; CHECK-NEXT: smlal2 v21.4s, v18.8h, v7.8h
930+ ; CHECK-NEXT: add v0.4s, v20.4s, v0.4s
931+ ; CHECK-NEXT: add v2.4s, v21.4s, v2.4s
931932; CHECK-NEXT: b.ne .LBB28_1
932933; CHECK-NEXT: // %bb.2: // %end
933- ; CHECK-NEXT: add v0.4s, v1 .4s, v0.4s
934+ ; CHECK-NEXT: add v0.4s, v2 .4s, v0.4s
934935; CHECK-NEXT: ret
935936entry:
936937 br label %vector.body
937938

938939vector.body:
939940 %iv = phi i64 [ 0 , %entry ], [ %iv.next , %vector.body ]
; Added lines: %acc1/%acc2 start at zero and carry %psum1/%psum2 around the
; loop back-edge, so each reduction accumulates across iterations.
941+ %acc1 = phi <4 x i32 > [ zeroinitializer , %entry ], [ %psum1 , %vector.body ]
942+ %acc2 = phi <4 x i32 > [ zeroinitializer , %entry ], [ %psum2 , %vector.body ]
940943 %ptr1 = getelementptr i8 , ptr %p1 , i64 %iv
941944 %ptr2 = getelementptr i8 , ptr %p2 , i64 %iv
942945 %ptr3 = getelementptr i8 , ptr %p3 , i64 %iv
; Removed lines: the loads read %p1/%p2/%p3 directly, not the %iv-indexed
; %ptr1-%ptr3, and both reductions start from zeroinitializer every iteration.
943- %load1 = load <16 x i8 >, ptr %p1 , align 1
944- %load2 = load <16 x i8 >, ptr %p2 , align 1
945- %load3 = load <16 x i8 >, ptr %p3 , align 1
946- %1 = sext <16 x i8 > %load1 to <16 x i32 >
947- %2 = zext <16 x i8 > %load3 to <16 x i32 >
948- %3 = mul <16 x i32 > %1 , %2
949- %psum1 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > zeroinitializer , <16 x i32 > %3 )
950- %4 = sext <16 x i8 > %load2 to <16 x i32 >
951- %5 = mul <16 x i32 > %4 , %2
952- %psum2 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > zeroinitializer , <16 x i32 > %5 )
; Added lines: the loads go through %ptr1-%ptr3, the shared %zext value is used
; by both %mul1 and %mul2, and the reductions accumulate into %acc1/%acc2.
946+ %load1 = load <16 x i8 >, ptr %ptr1
947+ %load2 = load <16 x i8 >, ptr %ptr2
948+ %load3 = load <16 x i8 >, ptr %ptr3
949+ %sext1 = sext <16 x i8 > %load1 to <16 x i32 >
950+ %zext = zext <16 x i8 > %load3 to <16 x i32 >
951+ %mul1 = mul <16 x i32 > %sext1 , %zext
952+ %psum1 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc1 , <16 x i32 > %mul1 )
953+ %sext2 = sext <16 x i8 > %load2 to <16 x i32 >
954+ %mul2 = mul <16 x i32 > %sext2 , %zext
955+ %psum2 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc2 , <16 x i32 > %mul2 )
; Advance the byte offset by 16 per iteration; leave the loop once it equals 1024.
953956 %iv.next = add i64 %iv , 16
954- %6 = icmp eq i64 %iv.next , 1024
955- br i1 %6 , label %end , label %vector.body
957+ %1 = icmp eq i64 %iv.next , 1024
958+ br i1 %1 , label %end , label %vector.body
956959

957960end:
; Combine the two partial-sum vectors into the returned <4 x i32>.
958- %7 = add <4 x i32 > %psum2 , %psum1
959- ret <4 x i32 > %7
961+ %2 = add <4 x i32 > %psum2 , %psum1
962+ ret <4 x i32 > %2
960963}
0 commit comments