Skip to content

Commit 08138a2

Browse files
[DAG] Fold trunc(avg(x,y)) for signed/unsigned avgceil and avgfloor nodes if their operands have sufficient leading zero/sign bits-5
1 parent: 1115256 · commit: 08138a2

File tree

1 file changed: 30 additions (+30) and 22 deletions (-22).

llvm/test/CodeGen/AArch64/trunc-avg-fold.ll

Lines changed: 30 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -4,26 +4,31 @@
44
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
55
; CHECK-LABEL: test_avgceil_u:
66
; CHECK: // %bb.0:
7+
; CHECK-NEXT: bic v0.8h, #255, lsl #8
8+
; CHECK-NEXT: bic v1.8h, #255, lsl #8
9+
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
710
; CHECK-NEXT: xtn v0.8b, v0.8h
8-
; CHECK-NEXT: xtn v1.8b, v1.8h
9-
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
1011
; CHECK-NEXT: ret
1112
%mask = insertelement <8 x i16> poison, i16 255, i32 0
1213
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
1314
%ta16 = and <8 x i16> %a, %mask.splat
1415
%tb16 = and <8 x i16> %b, %mask.splat
15-
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
16-
%tb8 = trunc <8 x i16> %tb16 to <8 x i8>
17-
%res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
16+
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
17+
%res = trunc <8 x i16> %avg16 to <8 x i8>
1818
ret <8 x i8> %res
1919
}
2020

2121
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
2222
; CHECK-LABEL: test_avgceil_s:
2323
; CHECK: // %bb.0:
24-
; CHECK-NEXT: sqxtn v0.8b, v0.8h
25-
; CHECK-NEXT: sqxtn v1.8b, v1.8h
26-
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
24+
; CHECK-NEXT: movi v2.8h, #127
25+
; CHECK-NEXT: mvni v3.8h, #127
26+
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
27+
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
28+
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
29+
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
30+
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
31+
; CHECK-NEXT: xtn v0.8b, v0.8h
2732
; CHECK-NEXT: ret
2833
%min = insertelement <8 x i16> poison, i16 -128, i32 0
2934
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -33,35 +38,39 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
3338
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
3439
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
3540
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
36-
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
37-
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
38-
%res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
41+
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
42+
%res = trunc <8 x i16> %avg16 to <8 x i8>
3943
ret <8 x i8> %res
4044
}
4145

4246
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
4347
; CHECK-LABEL: test_avgfloor_u:
4448
; CHECK: // %bb.0:
49+
; CHECK-NEXT: bic v0.8h, #255, lsl #8
50+
; CHECK-NEXT: bic v1.8h, #255, lsl #8
51+
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
4552
; CHECK-NEXT: xtn v0.8b, v0.8h
46-
; CHECK-NEXT: xtn v1.8b, v1.8h
47-
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
4853
; CHECK-NEXT: ret
4954
%mask = insertelement <8 x i16> poison, i16 255, i32 0
5055
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
5156
%ta16 = and <8 x i16> %a, %mask.splat
5257
%tb16 = and <8 x i16> %b, %mask.splat
53-
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
54-
%tb8 = trunc <8 x i16> %tb16 to <8 x i8>
55-
%res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
58+
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
59+
%res = trunc <8 x i16> %avg16 to <8 x i8>
5660
ret <8 x i8> %res
5761
}
5862

5963
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
6064
; CHECK-LABEL: test_avgfloor_s:
6165
; CHECK: // %bb.0:
62-
; CHECK-NEXT: sqxtn v0.8b, v0.8h
63-
; CHECK-NEXT: sqxtn v1.8b, v1.8h
64-
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
66+
; CHECK-NEXT: movi v2.8h, #127
67+
; CHECK-NEXT: mvni v3.8h, #127
68+
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
69+
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
70+
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
71+
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
72+
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
73+
; CHECK-NEXT: xtn v0.8b, v0.8h
6574
; CHECK-NEXT: ret
6675
%min = insertelement <8 x i16> poison, i16 -128, i32 0
6776
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -71,9 +80,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
7180
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
7281
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
7382
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
74-
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
75-
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
76-
%res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
83+
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
84+
%res = trunc <8 x i16> %avg16 to <8 x i8>
7785
ret <8 x i8> %res
7886
}
7987

0 commit comments

Comments
 (0)