Skip to content

Commit f704782

Browse files
authored
[AArch64][SelectionDAG] Fix UDOT regression (#144907)
Fix the broken check in AArch64ISelLowering that bails out of the ZExt optimizations when the extend is used by a partial reduction intrinsic.
1 parent 14e89b0 commit f704782

File tree

2 files changed

+59
-37
lines changed

2 files changed

+59
-37
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16869,14 +16869,14 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1686916869
if (SrcWidth * 4 <= DstWidth) {
1687016870
if (all_of(I->users(), [&](auto *U) {
1687116871
auto *SingleUser = cast<Instruction>(&*U);
16872-
return (
16873-
match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
16874-
(match(SingleUser,
16875-
m_Intrinsic<
16876-
Intrinsic::experimental_vector_partial_reduce_add>(
16877-
m_Value(), m_Specific(I))) &&
16878-
!shouldExpandPartialReductionIntrinsic(
16879-
cast<IntrinsicInst>(SingleUser))));
16872+
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16873+
return true;
16874+
if (match(SingleUser,
16875+
m_Intrinsic<
16876+
Intrinsic::experimental_vector_partial_reduce_add>(
16877+
m_Value(), m_Specific(I))))
16878+
return true;
16879+
return false;
1688016880
}))
1688116881
return false;
1688216882
}

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -772,35 +772,57 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
772772
}
773773

774774
define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
775-
; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop:
776-
; CHECK-COMMON: // %bb.0: // %entry
777-
; CHECK-COMMON-NEXT: adrp x8, .LCPI16_0
778-
; CHECK-COMMON-NEXT: movi v2.2d, #0000000000000000
779-
; CHECK-COMMON-NEXT: adrp x9, .LCPI16_2
780-
; CHECK-COMMON-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
781-
; CHECK-COMMON-NEXT: adrp x8, .LCPI16_1
782-
; CHECK-COMMON-NEXT: adrp x10, .LCPI16_3
783-
; CHECK-COMMON-NEXT: ldr q3, [x8, :lo12:.LCPI16_1]
784-
; CHECK-COMMON-NEXT: ldr q4, [x9, :lo12:.LCPI16_2]
785-
; CHECK-COMMON-NEXT: ldr q5, [x10, :lo12:.LCPI16_3]
786-
; CHECK-COMMON-NEXT: mov x8, xzr
787-
; CHECK-COMMON-NEXT: .LBB16_1: // %vector.body
788-
; CHECK-COMMON-NEXT: // =>This Inner Loop Header: Depth=1
789-
; CHECK-COMMON-NEXT: ldr q6, [x0, x8]
790-
; CHECK-COMMON-NEXT: mov v0.16b, v2.16b
791-
; CHECK-COMMON-NEXT: add x8, x8, #16
792-
; CHECK-COMMON-NEXT: cmp x8, #16
793-
; CHECK-COMMON-NEXT: tbl v7.16b, { v6.16b }, v3.16b
794-
; CHECK-COMMON-NEXT: tbl v16.16b, { v6.16b }, v4.16b
795-
; CHECK-COMMON-NEXT: tbl v17.16b, { v6.16b }, v5.16b
796-
; CHECK-COMMON-NEXT: tbl v6.16b, { v6.16b }, v1.16b
797-
; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v17.4s
798-
; CHECK-COMMON-NEXT: add v7.4s, v16.4s, v7.4s
799-
; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v7.4s
800-
; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v6.4s
801-
; CHECK-COMMON-NEXT: b.ne .LBB16_1
802-
; CHECK-COMMON-NEXT: // %bb.2: // %end
803-
; CHECK-COMMON-NEXT: ret
775+
; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
776+
; CHECK-NODOT: // %bb.0: // %entry
777+
; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
778+
; CHECK-NODOT-NEXT: mov x8, xzr
779+
; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body
780+
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
781+
; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
782+
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
783+
; CHECK-NODOT-NEXT: add x8, x8, #16
784+
; CHECK-NODOT-NEXT: cmp x8, #16
785+
; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
786+
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
787+
; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v3.4h
788+
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
789+
; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h
790+
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h
791+
; CHECK-NODOT-NEXT: b.ne .LBB16_1
792+
; CHECK-NODOT-NEXT: // %bb.2: // %end
793+
; CHECK-NODOT-NEXT: ret
794+
;
795+
; CHECK-DOT-LABEL: udot_no_bin_op_in_loop:
796+
; CHECK-DOT: // %bb.0: // %entry
797+
; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
798+
; CHECK-DOT-NEXT: movi v2.16b, #1
799+
; CHECK-DOT-NEXT: mov x8, xzr
800+
; CHECK-DOT-NEXT: .LBB16_1: // %vector.body
801+
; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
802+
; CHECK-DOT-NEXT: ldr q3, [x0, x8]
803+
; CHECK-DOT-NEXT: mov v0.16b, v1.16b
804+
; CHECK-DOT-NEXT: add x8, x8, #16
805+
; CHECK-DOT-NEXT: cmp x8, #16
806+
; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b
807+
; CHECK-DOT-NEXT: b.ne .LBB16_1
808+
; CHECK-DOT-NEXT: // %bb.2: // %end
809+
; CHECK-DOT-NEXT: ret
810+
;
811+
; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_in_loop:
812+
; CHECK-DOT-I8MM: // %bb.0: // %entry
813+
; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000
814+
; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1
815+
; CHECK-DOT-I8MM-NEXT: mov x8, xzr
816+
; CHECK-DOT-I8MM-NEXT: .LBB16_1: // %vector.body
817+
; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
818+
; CHECK-DOT-I8MM-NEXT: ldr q3, [x0, x8]
819+
; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b
820+
; CHECK-DOT-I8MM-NEXT: add x8, x8, #16
821+
; CHECK-DOT-I8MM-NEXT: cmp x8, #16
822+
; CHECK-DOT-I8MM-NEXT: udot v1.4s, v3.16b, v2.16b
823+
; CHECK-DOT-I8MM-NEXT: b.ne .LBB16_1
824+
; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end
825+
; CHECK-DOT-I8MM-NEXT: ret
804826
entry:
805827
br label %vector.body
806828

0 commit comments

Comments
 (0)