Skip to content

Commit b29c7de

Browse files
authored
[AArch64] Fold uaddv(a) to a if all lanes except the 0th are zero (#159086)
Fixes #158741
1 parent 948482d commit b29c7de

File tree

5 files changed

+287
-10
lines changed

5 files changed

+287
-10
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19076,6 +19076,18 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
1907619076
else if (SDValue R = performUADDVZextCombine(A, DAG))
1907719077
return R;
1907819078
}
19079+
19080+
// uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19081+
MVT VT = N->getSimpleValueType(0);
19082+
MVT OpVT = A.getSimpleValueType();
19083+
assert(VT == OpVT &&
19084+
"The operand type should be consistent with the result type of UADDV");
19085+
APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
19086+
Mask.clearBit(0);
19087+
KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19088+
if (KnownLeadingLanes.isZero())
19089+
return A;
19090+
1907919091
return SDValue();
1908019092
}
1908119093

llvm/test/CodeGen/AArch64/aarch64-addv.ll

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,3 +493,154 @@ entry:
493493
ret i128 %arg1
494494
}
495495

496+
define i16 @addv_zero_lanes_v4i16(ptr %arr) {
497+
; CHECK-SD-LABEL: addv_zero_lanes_v4i16:
498+
; CHECK-SD: // %bb.0:
499+
; CHECK-SD-NEXT: ldrb w0, [x0]
500+
; CHECK-SD-NEXT: ret
501+
;
502+
; CHECK-GI-LABEL: addv_zero_lanes_v4i16:
503+
; CHECK-GI: // %bb.0:
504+
; CHECK-GI-NEXT: ldrb w8, [x0]
505+
; CHECK-GI-NEXT: fmov d0, x8
506+
; CHECK-GI-NEXT: addv h0, v0.4h
507+
; CHECK-GI-NEXT: fmov w0, s0
508+
; CHECK-GI-NEXT: ret
509+
%v = load i64, ptr %arr
510+
%and = and i64 %v, 255
511+
%vec = bitcast i64 %and to <4 x i16>
512+
%r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %vec)
513+
ret i16 %r
514+
}
515+
516+
define i8 @addv_zero_lanes_v8i8(ptr %arr) {
517+
; CHECK-SD-LABEL: addv_zero_lanes_v8i8:
518+
; CHECK-SD: // %bb.0:
519+
; CHECK-SD-NEXT: ldrb w0, [x0]
520+
; CHECK-SD-NEXT: ret
521+
;
522+
; CHECK-GI-LABEL: addv_zero_lanes_v8i8:
523+
; CHECK-GI: // %bb.0:
524+
; CHECK-GI-NEXT: ldrb w8, [x0]
525+
; CHECK-GI-NEXT: fmov d0, x8
526+
; CHECK-GI-NEXT: addv b0, v0.8b
527+
; CHECK-GI-NEXT: fmov w0, s0
528+
; CHECK-GI-NEXT: ret
529+
%v = load i64, ptr %arr
530+
%and = and i64 %v, 255
531+
%vec = bitcast i64 %and to <8 x i8>
532+
%r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec)
533+
ret i8 %r
534+
}
535+
536+
define i8 @addv_zero_lanes_negative_v8i8(ptr %arr) {
537+
; CHECK-LABEL: addv_zero_lanes_negative_v8i8:
538+
; CHECK: // %bb.0:
539+
; CHECK-NEXT: ldr x8, [x0]
540+
; CHECK-NEXT: and x8, x8, #0x100
541+
; CHECK-NEXT: fmov d0, x8
542+
; CHECK-NEXT: addv b0, v0.8b
543+
; CHECK-NEXT: fmov w0, s0
544+
; CHECK-NEXT: ret
545+
%v = load i64, ptr %arr
546+
%and = and i64 %v, 256
547+
%vec = bitcast i64 %and to <8 x i8>
548+
%r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec)
549+
ret i8 %r
550+
}
551+
552+
553+
define i8 @addv_zero_lanes_v16i8(ptr %arr) {
554+
; CHECK-SD-LABEL: addv_zero_lanes_v16i8:
555+
; CHECK-SD: // %bb.0:
556+
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
557+
; CHECK-SD-NEXT: ldrb w8, [x0]
558+
; CHECK-SD-NEXT: mov v0.d[0], x8
559+
; CHECK-SD-NEXT: addv b0, v0.16b
560+
; CHECK-SD-NEXT: fmov w0, s0
561+
; CHECK-SD-NEXT: ret
562+
;
563+
; CHECK-GI-LABEL: addv_zero_lanes_v16i8:
564+
; CHECK-GI: // %bb.0:
565+
; CHECK-GI-NEXT: ldrb w8, [x0]
566+
; CHECK-GI-NEXT: mov v0.d[0], x8
567+
; CHECK-GI-NEXT: mov v0.d[1], xzr
568+
; CHECK-GI-NEXT: addv b0, v0.16b
569+
; CHECK-GI-NEXT: fmov w0, s0
570+
; CHECK-GI-NEXT: ret
571+
%v = load i128, ptr %arr
572+
%and = and i128 %v, 255
573+
%vec = bitcast i128 %and to <16 x i8>
574+
%r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %vec)
575+
ret i8 %r
576+
}
577+
578+
define i16 @addv_zero_lanes_v8i16(ptr %arr) {
579+
; CHECK-SD-LABEL: addv_zero_lanes_v8i16:
580+
; CHECK-SD: // %bb.0:
581+
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
582+
; CHECK-SD-NEXT: ldrh w8, [x0]
583+
; CHECK-SD-NEXT: mov v0.d[0], x8
584+
; CHECK-SD-NEXT: addv h0, v0.8h
585+
; CHECK-SD-NEXT: fmov w0, s0
586+
; CHECK-SD-NEXT: ret
587+
;
588+
; CHECK-GI-LABEL: addv_zero_lanes_v8i16:
589+
; CHECK-GI: // %bb.0:
590+
; CHECK-GI-NEXT: ldrh w8, [x0]
591+
; CHECK-GI-NEXT: mov v0.d[0], x8
592+
; CHECK-GI-NEXT: mov v0.d[1], xzr
593+
; CHECK-GI-NEXT: addv h0, v0.8h
594+
; CHECK-GI-NEXT: fmov w0, s0
595+
; CHECK-GI-NEXT: ret
596+
%v = load i128, ptr %arr
597+
%and = and i128 %v, u0xFFFF
598+
%vec = bitcast i128 %and to <8 x i16>
599+
%r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %vec)
600+
ret i16 %r
601+
}
602+
603+
define i32 @addv_zero_lanes_v4i32(ptr %arr) {
604+
; CHECK-SD-LABEL: addv_zero_lanes_v4i32:
605+
; CHECK-SD: // %bb.0:
606+
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
607+
; CHECK-SD-NEXT: ldr w8, [x0]
608+
; CHECK-SD-NEXT: mov v0.d[0], x8
609+
; CHECK-SD-NEXT: addv s0, v0.4s
610+
; CHECK-SD-NEXT: fmov w0, s0
611+
; CHECK-SD-NEXT: ret
612+
;
613+
; CHECK-GI-LABEL: addv_zero_lanes_v4i32:
614+
; CHECK-GI: // %bb.0:
615+
; CHECK-GI-NEXT: ldr w8, [x0]
616+
; CHECK-GI-NEXT: mov v0.d[0], x8
617+
; CHECK-GI-NEXT: mov v0.d[1], xzr
618+
; CHECK-GI-NEXT: addv s0, v0.4s
619+
; CHECK-GI-NEXT: fmov w0, s0
620+
; CHECK-GI-NEXT: ret
621+
%v = load i128, ptr %arr
622+
%and = and i128 %v, u0xFFFFFFFF
623+
%vec = bitcast i128 %and to <4 x i32>
624+
%r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
625+
ret i32 %r
626+
}
627+
628+
define i32 @addv_zero_lanes_v2i32(ptr %arr) {
629+
; CHECK-SD-LABEL: addv_zero_lanes_v2i32:
630+
; CHECK-SD: // %bb.0:
631+
; CHECK-SD-NEXT: ldr w0, [x0]
632+
; CHECK-SD-NEXT: ret
633+
;
634+
; CHECK-GI-LABEL: addv_zero_lanes_v2i32:
635+
; CHECK-GI: // %bb.0:
636+
; CHECK-GI-NEXT: ldr w8, [x0]
637+
; CHECK-GI-NEXT: fmov d0, x8
638+
; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
639+
; CHECK-GI-NEXT: fmov w0, s0
640+
; CHECK-GI-NEXT: ret
641+
%v = load i64, ptr %arr
642+
%and = and i64 %v, u0xFFFFFFFF
643+
%vec = bitcast i64 %and to <2 x i32>
644+
%r = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %vec)
645+
ret i32 %r
646+
}

llvm/test/CodeGen/AArch64/abds.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -496,13 +496,9 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind {
496496
define i64 @vector_legalized(i16 %a, i16 %b) {
497497
; CHECK-LABEL: vector_legalized:
498498
; CHECK: // %bb.0:
499-
; CHECK-NEXT: movi v0.2d, #0000000000000000
500499
; CHECK-NEXT: sxth w8, w0
501500
; CHECK-NEXT: subs w8, w8, w1, sxth
502-
; CHECK-NEXT: addp d0, v0.2d
503-
; CHECK-NEXT: cneg w8, w8, mi
504-
; CHECK-NEXT: fmov x9, d0
505-
; CHECK-NEXT: add x0, x9, x8
501+
; CHECK-NEXT: cneg w0, w8, mi
506502
; CHECK-NEXT: ret
507503
%ea = sext i16 %a to i32
508504
%eb = sext i16 %b to i32

llvm/test/CodeGen/AArch64/abdu.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -361,13 +361,9 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
361361
define i64 @vector_legalized(i16 %a, i16 %b) {
362362
; CHECK-LABEL: vector_legalized:
363363
; CHECK: // %bb.0:
364-
; CHECK-NEXT: movi v0.2d, #0000000000000000
365364
; CHECK-NEXT: and w8, w0, #0xffff
366365
; CHECK-NEXT: subs w8, w8, w1, uxth
367-
; CHECK-NEXT: cneg w8, w8, mi
368-
; CHECK-NEXT: addp d0, v0.2d
369-
; CHECK-NEXT: fmov x9, d0
370-
; CHECK-NEXT: add x0, x9, x8
366+
; CHECK-NEXT: cneg w0, w8, mi
371367
; CHECK-NEXT: ret
372368
%ea = zext i16 %a to i32
373369
%eb = zext i16 %b to i32

llvm/test/CodeGen/AArch64/ctpop.ll

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,3 +505,125 @@ entry:
505505
%s = call <4 x i128> @llvm.ctpop(<4 x i128> %d)
506506
ret <4 x i128> %s
507507
}
508+
509+
define i8 @i8(i8 %x) {
510+
; CHECK-SD-LABEL: i8:
511+
; CHECK-SD: // %bb.0: // %entry
512+
; CHECK-SD-NEXT: and w8, w0, #0xff
513+
; CHECK-SD-NEXT: fmov s0, w8
514+
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
515+
; CHECK-SD-NEXT: fmov w0, s0
516+
; CHECK-SD-NEXT: ret
517+
;
518+
; CHECK-GI-LABEL: i8:
519+
; CHECK-GI: // %bb.0: // %entry
520+
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
521+
; CHECK-GI-NEXT: and x8, x0, #0xff
522+
; CHECK-GI-NEXT: fmov d0, x8
523+
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
524+
; CHECK-GI-NEXT: uaddlv h0, v0.8b
525+
; CHECK-GI-NEXT: fmov w0, s0
526+
; CHECK-GI-NEXT: ret
527+
entry:
528+
%s = call i8 @llvm.ctpop.i8(i8 %x)
529+
ret i8 %s
530+
}
531+
532+
define i16 @i16_mask(i16 %x) {
533+
; CHECK-SD-LABEL: i16_mask:
534+
; CHECK-SD: // %bb.0: // %entry
535+
; CHECK-SD-NEXT: and w8, w0, #0xff
536+
; CHECK-SD-NEXT: fmov s0, w8
537+
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
538+
; CHECK-SD-NEXT: fmov w0, s0
539+
; CHECK-SD-NEXT: ret
540+
;
541+
; CHECK-GI-LABEL: i16_mask:
542+
; CHECK-GI: // %bb.0: // %entry
543+
; CHECK-GI-NEXT: and w8, w0, #0xff
544+
; CHECK-GI-NEXT: and x8, x8, #0xffff
545+
; CHECK-GI-NEXT: fmov d0, x8
546+
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
547+
; CHECK-GI-NEXT: uaddlv h0, v0.8b
548+
; CHECK-GI-NEXT: fmov w0, s0
549+
; CHECK-GI-NEXT: ret
550+
entry:
551+
%and = and i16 %x, 255
552+
%s = call i16 @llvm.ctpop.i16(i16 %and)
553+
ret i16 %s
554+
}
555+
556+
define i32 @i32_mask(i32 %x) {
557+
; CHECK-SD-LABEL: i32_mask:
558+
; CHECK-SD: // %bb.0: // %entry
559+
; CHECK-SD-NEXT: and w8, w0, #0xff
560+
; CHECK-SD-NEXT: fmov s0, w8
561+
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
562+
; CHECK-SD-NEXT: fmov w0, s0
563+
; CHECK-SD-NEXT: ret
564+
;
565+
; CHECK-GI-LABEL: i32_mask:
566+
; CHECK-GI: // %bb.0: // %entry
567+
; CHECK-GI-NEXT: and w8, w0, #0xff
568+
; CHECK-GI-NEXT: fmov s0, w8
569+
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
570+
; CHECK-GI-NEXT: uaddlv h0, v0.8b
571+
; CHECK-GI-NEXT: fmov w0, s0
572+
; CHECK-GI-NEXT: ret
573+
entry:
574+
%and = and i32 %x, 255
575+
%s = call i32 @llvm.ctpop.i32(i32 %and)
576+
ret i32 %s
577+
}
578+
579+
define i32 @i32_mask_negative(i32 %x) {
580+
; CHECK-SD-LABEL: i32_mask_negative:
581+
; CHECK-SD: // %bb.0: // %entry
582+
; CHECK-SD-NEXT: and w8, w0, #0xffff
583+
; CHECK-SD-NEXT: fmov s0, w8
584+
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
585+
; CHECK-SD-NEXT: addv b0, v0.8b
586+
; CHECK-SD-NEXT: fmov w0, s0
587+
; CHECK-SD-NEXT: ret
588+
;
589+
; CHECK-GI-LABEL: i32_mask_negative:
590+
; CHECK-GI: // %bb.0: // %entry
591+
; CHECK-GI-NEXT: and w8, w0, #0xffff
592+
; CHECK-GI-NEXT: fmov s0, w8
593+
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
594+
; CHECK-GI-NEXT: uaddlv h0, v0.8b
595+
; CHECK-GI-NEXT: fmov w0, s0
596+
; CHECK-GI-NEXT: ret
597+
entry:
598+
%and = and i32 %x, 65535
599+
%s = call i32 @llvm.ctpop.i32(i32 %and)
600+
ret i32 %s
601+
}
602+
603+
define i128 @i128_mask(i128 %x) {
604+
; CHECK-SD-LABEL: i128_mask:
605+
; CHECK-SD: // %bb.0: // %entry
606+
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
607+
; CHECK-SD-NEXT: and x8, x0, #0xff
608+
; CHECK-SD-NEXT: mov x1, xzr
609+
; CHECK-SD-NEXT: mov v0.d[0], x8
610+
; CHECK-SD-NEXT: cnt v0.16b, v0.16b
611+
; CHECK-SD-NEXT: addv b0, v0.16b
612+
; CHECK-SD-NEXT: fmov x0, d0
613+
; CHECK-SD-NEXT: ret
614+
;
615+
; CHECK-GI-LABEL: i128_mask:
616+
; CHECK-GI: // %bb.0: // %entry
617+
; CHECK-GI-NEXT: and x8, x0, #0xff
618+
; CHECK-GI-NEXT: mov x1, xzr
619+
; CHECK-GI-NEXT: mov v0.d[0], x8
620+
; CHECK-GI-NEXT: mov v0.d[1], xzr
621+
; CHECK-GI-NEXT: cnt v0.16b, v0.16b
622+
; CHECK-GI-NEXT: uaddlv h0, v0.16b
623+
; CHECK-GI-NEXT: mov w0, v0.s[0]
624+
; CHECK-GI-NEXT: ret
625+
entry:
626+
%and = and i128 %x, 255
627+
%s = call i128 @llvm.ctpop.i128(i128 %and)
628+
ret i128 %s
629+
}

0 commit comments

Comments
 (0)