Skip to content

Commit 189d04d

Browse files
Commit message: Address comments
1 parent 32a41e9 commit 189d04d

File tree

1 file changed (+50 / -47 lines changed)

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 50 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -891,70 +891,73 @@ entry:
891891
define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
892892
; CHECK-LABEL: usdot_multiple_zext_users:
893893
; CHECK: // %bb.0: // %entry
894-
; CHECK-NEXT: adrp x9, .LCPI28_0
895-
; CHECK-NEXT: adrp x10, .LCPI28_3
896-
; CHECK-NEXT: ldr q0, [x2]
897-
; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI28_0]
898-
; CHECK-NEXT: adrp x9, .LCPI28_1
899-
; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI28_3]
900-
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI28_1]
894+
; CHECK-NEXT: adrp x8, .LCPI28_0
895+
; CHECK-NEXT: movi v0.2d, #0000000000000000
896+
; CHECK-NEXT: movi v2.2d, #0000000000000000
897+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0]
898+
; CHECK-NEXT: adrp x8, .LCPI28_1
901899
; CHECK-NEXT: adrp x9, .LCPI28_2
902-
; CHECK-NEXT: ldr q5, [x1]
903-
; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI28_2]
904-
; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b
900+
; CHECK-NEXT: adrp x10, .LCPI28_3
901+
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI28_1]
902+
; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI28_2]
903+
; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI28_3]
905904
; CHECK-NEXT: mov x8, xzr
906-
; CHECK-NEXT: tbl v2.16b, { v0.16b }, v2.16b
907-
; CHECK-NEXT: mov w9, #1024 // =0x400
908-
; CHECK-NEXT: tbl v3.16b, { v0.16b }, v3.16b
909-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v4.16b
910-
; CHECK-NEXT: ldr q4, [x0]
911-
; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
912-
; CHECK-NEXT: sshll v2.8h, v4.8b, #0
913-
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
914-
; CHECK-NEXT: sshll2 v3.8h, v4.16b, #0
915-
; CHECK-NEXT: sshll v4.8h, v5.8b, #0
916-
; CHECK-NEXT: sshll2 v5.8h, v5.16b, #0
917-
; CHECK-NEXT: smull v6.4s, v2.4h, v1.4h
918-
; CHECK-NEXT: smull v17.4s, v4.4h, v1.4h
919-
; CHECK-NEXT: smull v7.4s, v3.4h, v0.4h
920-
; CHECK-NEXT: smull v16.4s, v5.4h, v0.4h
921-
; CHECK-NEXT: smlal2 v6.4s, v3.8h, v0.8h
922-
; CHECK-NEXT: smlal2 v17.4s, v5.8h, v0.8h
923-
; CHECK-NEXT: smlal2 v7.4s, v2.8h, v1.8h
924-
; CHECK-NEXT: smlal2 v16.4s, v4.8h, v1.8h
925-
; CHECK-NEXT: add v0.4s, v7.4s, v6.4s
926-
; CHECK-NEXT: add v1.4s, v16.4s, v17.4s
927905
; CHECK-NEXT: .LBB28_1: // %vector.body
928906
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
929-
; CHECK-NEXT: subs x9, x9, #16
907+
; CHECK-NEXT: ldr q6, [x2, x8]
908+
; CHECK-NEXT: ldr q18, [x0, x8]
909+
; CHECK-NEXT: ldr q19, [x1, x8]
930910
; CHECK-NEXT: add x8, x8, #16
911+
; CHECK-NEXT: tbl v7.16b, { v6.16b }, v1.16b
912+
; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
913+
; CHECK-NEXT: tbl v17.16b, { v6.16b }, v4.16b
914+
; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
915+
; CHECK-NEXT: cmp x8, #1024
916+
; CHECK-NEXT: uzp1 v7.8h, v16.8h, v7.8h
917+
; CHECK-NEXT: sshll v16.8h, v18.8b, #0
918+
; CHECK-NEXT: uzp1 v6.8h, v6.8h, v17.8h
919+
; CHECK-NEXT: sshll2 v17.8h, v18.16b, #0
920+
; CHECK-NEXT: sshll v18.8h, v19.8b, #0
921+
; CHECK-NEXT: sshll2 v19.8h, v19.16b, #0
922+
; CHECK-NEXT: smlal v0.4s, v16.4h, v7.4h
923+
; CHECK-NEXT: smlal v2.4s, v18.4h, v7.4h
924+
; CHECK-NEXT: smull v20.4s, v17.4h, v6.4h
925+
; CHECK-NEXT: smull v21.4s, v19.4h, v6.4h
926+
; CHECK-NEXT: smlal2 v0.4s, v17.8h, v6.8h
927+
; CHECK-NEXT: smlal2 v2.4s, v19.8h, v6.8h
928+
; CHECK-NEXT: smlal2 v20.4s, v16.8h, v7.8h
929+
; CHECK-NEXT: smlal2 v21.4s, v18.8h, v7.8h
930+
; CHECK-NEXT: add v0.4s, v20.4s, v0.4s
931+
; CHECK-NEXT: add v2.4s, v21.4s, v2.4s
931932
; CHECK-NEXT: b.ne .LBB28_1
932933
; CHECK-NEXT: // %bb.2: // %end
933-
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
934+
; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
934935
; CHECK-NEXT: ret
935936
entry:
936937
br label %vector.body
937938

938939
vector.body:
939940
%iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ]
941+
%acc1 = phi <4 x i32> [ zeroinitializer, %entry], [ %psum1, %vector.body]
942+
%acc2 = phi <4 x i32> [ zeroinitializer, %entry], [ %psum2, %vector.body]
940943
%ptr1 = getelementptr i8, ptr %p1, i64 %iv
941944
%ptr2 = getelementptr i8, ptr %p2, i64 %iv
942945
%ptr3 = getelementptr i8, ptr %p3, i64 %iv
943-
%load1 = load <16 x i8>, ptr %p1, align 1
944-
%load2 = load <16 x i8>, ptr %p2, align 1
945-
%load3 = load <16 x i8>, ptr %p3, align 1
946-
%1 = sext <16 x i8> %load1 to <16 x i32>
947-
%2 = zext <16 x i8> %load3 to <16 x i32>
948-
%3 = mul <16 x i32> %1, %2
949-
%psum1 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> zeroinitializer, <16 x i32> %3)
950-
%4 = sext <16 x i8> %load2 to <16 x i32>
951-
%5 = mul <16 x i32> %4, %2
952-
%psum2 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> zeroinitializer, <16 x i32> %5)
946+
%load1 = load <16 x i8>, ptr %ptr1
947+
%load2 = load <16 x i8>, ptr %ptr2
948+
%load3 = load <16 x i8>, ptr %ptr3
949+
%sext1 = sext <16 x i8> %load1 to <16 x i32>
950+
%zext = zext <16 x i8> %load3 to <16 x i32>
951+
%mul1 = mul <16 x i32> %sext1, %zext
952+
%psum1 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc1, <16 x i32> %mul1)
953+
%sext2 = sext <16 x i8> %load2 to <16 x i32>
954+
%mul2 = mul <16 x i32> %sext2, %zext
955+
%psum2 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc2, <16 x i32> %mul2)
953956
%iv.next = add i64 %iv, 16
954-
%6 = icmp eq i64 %iv.next, 1024
955-
br i1 %6, label %end, label %vector.body
957+
%1 = icmp eq i64 %iv.next, 1024
958+
br i1 %1, label %end, label %vector.body
956959

957960
end:
958-
%7 = add <4 x i32> %psum2, %psum1
959-
ret <4 x i32> %7
961+
%2 = add <4 x i32> %psum2, %psum1
962+
ret <4 x i32> %2
960963
}

0 commit comments

Comments (0)