@@ -887,3 +887,74 @@ entry:
887887 %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
888888 ret <2 x i64 > %partial.reduce
889889}
890+
; Codegen test: one zero-extended <16 x i8> operand (%2, from %load3) is shared
; by two multiplies, each against a different sign-extended operand (%1 from
; %load1, %4 from %load2). Each product feeds its own partial.reduce.add call,
; and the two <4 x i32> results are added together in %end and returned.
;
; The expected output below contains smull/smlal2 sequences and no dot-product
; instruction. NOTE(review): presumably the point of the test is that the
; shared zext is handled correctly when it has several users; confirm against
; the accompanying code change.
891+ define <4 x i32 > @usdot_multiple_zext_users (ptr %p1 , ptr %p2 , ptr %p3 ) {
892+ ; CHECK-LABEL: usdot_multiple_zext_users:
893+ ; CHECK: // %bb.0: // %entry
894+ ; CHECK-NEXT: adrp x9, .LCPI28_0
895+ ; CHECK-NEXT: adrp x10, .LCPI28_3
896+ ; CHECK-NEXT: ldr q0, [x2]
897+ ; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI28_0]
898+ ; CHECK-NEXT: adrp x9, .LCPI28_1
899+ ; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI28_3]
900+ ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI28_1]
901+ ; CHECK-NEXT: adrp x9, .LCPI28_2
902+ ; CHECK-NEXT: ldr q5, [x1]
903+ ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI28_2]
904+ ; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b
905+ ; CHECK-NEXT: mov x8, xzr
906+ ; CHECK-NEXT: tbl v2.16b, { v0.16b }, v2.16b
907+ ; CHECK-NEXT: mov w9, #1024 // =0x400
908+ ; CHECK-NEXT: tbl v3.16b, { v0.16b }, v3.16b
909+ ; CHECK-NEXT: tbl v0.16b, { v0.16b }, v4.16b
910+ ; CHECK-NEXT: ldr q4, [x0]
911+ ; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
912+ ; CHECK-NEXT: sshll v2.8h, v4.8b, #0
913+ ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
914+ ; CHECK-NEXT: sshll2 v3.8h, v4.16b, #0
915+ ; CHECK-NEXT: sshll v4.8h, v5.8b, #0
916+ ; CHECK-NEXT: sshll2 v5.8h, v5.16b, #0
917+ ; CHECK-NEXT: smull v6.4s, v2.4h, v1.4h
918+ ; CHECK-NEXT: smull v17.4s, v4.4h, v1.4h
919+ ; CHECK-NEXT: smull v7.4s, v3.4h, v0.4h
920+ ; CHECK-NEXT: smull v16.4s, v5.4h, v0.4h
921+ ; CHECK-NEXT: smlal2 v6.4s, v3.8h, v0.8h
922+ ; CHECK-NEXT: smlal2 v17.4s, v5.8h, v0.8h
923+ ; CHECK-NEXT: smlal2 v7.4s, v2.8h, v1.8h
924+ ; CHECK-NEXT: smlal2 v16.4s, v4.8h, v1.8h
925+ ; CHECK-NEXT: add v0.4s, v7.4s, v6.4s
926+ ; CHECK-NEXT: add v1.4s, v16.4s, v17.4s
927+ ; CHECK-NEXT: .LBB28_1: // %vector.body
928+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
929+ ; CHECK-NEXT: subs x9, x9, #16
930+ ; CHECK-NEXT: add x8, x8, #16
931+ ; CHECK-NEXT: b.ne .LBB28_1
932+ ; CHECK-NEXT: // %bb.2: // %end
933+ ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
934+ ; CHECK-NEXT: ret
935+ entry:
936+ br label %vector.body
937+
938+ vector.body:
; Induction variable: starts at 0, advances by 16, and the loop exits once it
; reaches 1024.
939+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %vector.body ]
; NOTE(review): %ptr1/%ptr2/%ptr3 are computed here but never used; the three
; loads below read from %p1/%p2/%p3 directly. The loaded values are therefore
; the same on every iteration, and in the expected output above all vector
; work sits before the loop label while the loop itself only updates counters.
; If per-iteration addressing was intended, the loads should use %ptr1..%ptr3
; and the assertions must be regenerated -- confirm with the test author.
940+ %ptr1 = getelementptr i8 , ptr %p1 , i64 %iv
941+ %ptr2 = getelementptr i8 , ptr %p2 , i64 %iv
942+ %ptr3 = getelementptr i8 , ptr %p3 , i64 %iv
943+ %load1 = load <16 x i8 >, ptr %p1 , align 1
944+ %load2 = load <16 x i8 >, ptr %p2 , align 1
945+ %load3 = load <16 x i8 >, ptr %p3 , align 1
; First product: sign-extended %load1 times zero-extended %load3.
946+ %1 = sext <16 x i8 > %load1 to <16 x i32 >
; %2 is the shared zero-extended operand; it is used by both %3 and %5.
947+ %2 = zext <16 x i8 > %load3 to <16 x i32 >
948+ %3 = mul <16 x i32 > %1 , %2
; NOTE(review): both partial reductions take a zeroinitializer accumulator
; rather than a loop-carried phi, so nothing is accumulated across
; iterations -- confirm this is intentional.
949+ %psum1 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > zeroinitializer , <16 x i32 > %3 )
; Second product: sign-extended %load2 times the same zero-extended %2.
950+ %4 = sext <16 x i8 > %load2 to <16 x i32 >
951+ %5 = mul <16 x i32 > %4 , %2
952+ %psum2 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > zeroinitializer , <16 x i32 > %5 )
953+ %iv.next = add i64 %iv , 16
954+ %6 = icmp eq i64 %iv.next , 1024
955+ br i1 %6 , label %end , label %vector.body
956+
957+ end:
; Combine the two partial sums and return the result.
958+ %7 = add <4 x i32 > %psum2 , %psum1
959+ ret <4 x i32 > %7
960+ }
0 commit comments