; test_avgceil_u: unsigned halving add on <8 x i16> inputs whose lanes are
; first masked with a splat of 255, with the result returned as <8 x i8>.
; This block is shown in diff form: lines marked `-` are the previous test,
; lines marked `+` are the updated test, unmarked lines are unchanged.
;   previous IR: trunc each masked operand to <8 x i8>, then call uhadd.v8i8.
;   updated IR:  call uhadd.v8i16 on the masked operands, then trunc the
;                result to <8 x i8>.
; The updated CHECK lines expect the masking to stay visible as a bic on each
; input, a uhadd on .8h lanes and a single xtn afterwards; the previous CHECK
; lines expected an xtn per operand followed by a uhadd on .8b lanes.
; NOTE(review): the function is named "avgceil" but the intrinsic called is
; uhadd rather than a rounding variant (urhadd) - confirm this is intended.
44define <8 x i8 > @test_avgceil_u (<8 x i16 > %a , <8 x i16 > %b ) {
55; CHECK-LABEL: test_avgceil_u:
66; CHECK: // %bb.0:
7+ ; CHECK-NEXT: bic v0.8h, #255, lsl #8
8+ ; CHECK-NEXT: bic v1.8h, #255, lsl #8
9+ ; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
710; CHECK-NEXT: xtn v0.8b, v0.8h
8- ; CHECK-NEXT: xtn v1.8b, v1.8h
9- ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
1011; CHECK-NEXT: ret
1112 %mask = insertelement <8 x i16 > poison, i16 255 , i32 0
1213 %mask.splat = shufflevector <8 x i16 > %mask , <8 x i16 > poison, <8 x i32 > zeroinitializer
1314 %ta16 = and <8 x i16 > %a , %mask.splat
1415 %tb16 = and <8 x i16 > %b , %mask.splat
15- %ta8 = trunc <8 x i16 > %ta16 to <8 x i8 >
16- %tb8 = trunc <8 x i16 > %tb16 to <8 x i8 >
17- %res = call <8 x i8 > @llvm.aarch64.neon.uhadd.v8i8 (<8 x i8 > %ta8 , <8 x i8 > %tb8 )
16+ %avg16 = call <8 x i16 > @llvm.aarch64.neon.uhadd.v8i16 (<8 x i16 > %ta16 , <8 x i16 > %tb16 )
17+ %res = trunc <8 x i16 > %avg16 to <8 x i8 >
1818 ret <8 x i8 > %res
1919}
2020
; test_avgceil_s: signed halving add on <8 x i16> inputs that are clamped
; with smin/smax before the operation, with the result returned as <8 x i8>.
; This block is shown in diff form: lines marked `-` are the previous test,
; lines marked `+` are the updated test, unmarked lines are unchanged. The
; `@@` hunk header in the middle means some unchanged lines are not shown
; here; the definitions of %max.splat and %ta16 fall in that gap (presumably
; a splat of 127 and an smin of %a against it - confirm against the full file).
;   previous IR: trunc each clamped operand to <8 x i8>, then call shadd.v8i8.
;   updated IR:  call shadd.v8i16 on the clamped operands, then trunc the
;                result to <8 x i8>.
; The updated CHECK lines expect the clamp to be emitted explicitly (movi/mvni
; constants, smin and smax on both inputs), then a shadd on .8h lanes and one
; xtn; the previous CHECK lines expected sqxtn on each operand followed by a
; shadd on .8b lanes.
; NOTE(review): the function is named "avgceil" but the intrinsic called is
; shadd rather than a rounding variant (srhadd) - confirm this is intended.
2121define <8 x i8 > @test_avgceil_s (<8 x i16 > %a , <8 x i16 > %b ) {
2222; CHECK-LABEL: test_avgceil_s:
2323; CHECK: // %bb.0:
24- ; CHECK-NEXT: sqxtn v0.8b, v0.8h
25- ; CHECK-NEXT: sqxtn v1.8b, v1.8h
26- ; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
24+ ; CHECK-NEXT: movi v2.8h, #127
25+ ; CHECK-NEXT: mvni v3.8h, #127
26+ ; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
27+ ; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
28+ ; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
29+ ; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
30+ ; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
31+ ; CHECK-NEXT: xtn v0.8b, v0.8h
2732; CHECK-NEXT: ret
2833 %min = insertelement <8 x i16 > poison, i16 -128 , i32 0
2934 %min.splat = shufflevector <8 x i16 > %min , <8 x i16 > poison, <8 x i32 > zeroinitializer
@@ -33,35 +38,39 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
3338 %ta16.clamped = call <8 x i16 > @llvm.smax.v8i16 (<8 x i16 > %ta16 , <8 x i16 > %min.splat )
3439 %tb16 = call <8 x i16 > @llvm.smin.v8i16 (<8 x i16 > %b , <8 x i16 > %max.splat )
3540 %tb16.clamped = call <8 x i16 > @llvm.smax.v8i16 (<8 x i16 > %tb16 , <8 x i16 > %min.splat )
36- %ta8 = trunc <8 x i16 > %ta16.clamped to <8 x i8 >
37- %tb8 = trunc <8 x i16 > %tb16.clamped to <8 x i8 >
38- %res = call <8 x i8 > @llvm.aarch64.neon.shadd.v8i8 (<8 x i8 > %ta8 , <8 x i8 > %tb8 )
41+ %avg16 = call <8 x i16 > @llvm.aarch64.neon.shadd.v8i16 (<8 x i16 > %ta16.clamped , <8 x i16 > %tb16.clamped )
42+ %res = trunc <8 x i16 > %avg16 to <8 x i8 >
3943 ret <8 x i8 > %res
4044}
4145
; test_avgfloor_u: unsigned halving add (uhadd) on <8 x i16> inputs whose
; lanes are first masked with a splat of 255, with the result returned as
; <8 x i8>.
; This block is shown in diff form: lines marked `-` are the previous test,
; lines marked `+` are the updated test, unmarked lines are unchanged.
;   previous IR: trunc each masked operand to <8 x i8>, then call uhadd.v8i8.
;   updated IR:  call uhadd.v8i16 on the masked operands, then trunc the
;                result to <8 x i8>.
; The updated CHECK lines expect a bic on each input, a uhadd on .8h lanes
; and a single xtn afterwards; the previous CHECK lines expected an xtn per
; operand followed by a uhadd on .8b lanes.
4246define <8 x i8 > @test_avgfloor_u (<8 x i16 > %a , <8 x i16 > %b ) {
4347; CHECK-LABEL: test_avgfloor_u:
4448; CHECK: // %bb.0:
49+ ; CHECK-NEXT: bic v0.8h, #255, lsl #8
50+ ; CHECK-NEXT: bic v1.8h, #255, lsl #8
51+ ; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
4552; CHECK-NEXT: xtn v0.8b, v0.8h
46- ; CHECK-NEXT: xtn v1.8b, v1.8h
47- ; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
4853; CHECK-NEXT: ret
4954 %mask = insertelement <8 x i16 > poison, i16 255 , i32 0
5055 %mask.splat = shufflevector <8 x i16 > %mask , <8 x i16 > poison, <8 x i32 > zeroinitializer
5156 %ta16 = and <8 x i16 > %a , %mask.splat
5257 %tb16 = and <8 x i16 > %b , %mask.splat
53- %ta8 = trunc <8 x i16 > %ta16 to <8 x i8 >
54- %tb8 = trunc <8 x i16 > %tb16 to <8 x i8 >
55- %res = call <8 x i8 > @llvm.aarch64.neon.uhadd.v8i8 (<8 x i8 > %ta8 , <8 x i8 > %tb8 )
58+ %avg16 = call <8 x i16 > @llvm.aarch64.neon.uhadd.v8i16 (<8 x i16 > %ta16 , <8 x i16 > %tb16 )
59+ %res = trunc <8 x i16 > %avg16 to <8 x i8 >
5660 ret <8 x i8 > %res
5761}
5862
; test_avgfloor_s: signed halving add (shadd) on <8 x i16> inputs that are
; clamped with smin/smax before the operation, with the result returned as
; <8 x i8>.
; This block is shown in diff form: lines marked `-` are the previous test,
; lines marked `+` are the updated test, unmarked lines are unchanged. The
; `@@` hunk header in the middle means some unchanged lines are not shown
; here; the definitions of %max.splat and %ta16 fall in that gap (presumably
; a splat of 127 and an smin of %a against it - confirm against the full file).
;   previous IR: trunc each clamped operand to <8 x i8>, then call shadd.v8i8.
;   updated IR:  call shadd.v8i16 on the clamped operands, then trunc the
;                result to <8 x i8>.
; The updated CHECK lines expect the clamp to be emitted explicitly (movi/mvni
; constants, smin and smax on both inputs), then a shadd on .8h lanes and one
; xtn; the previous CHECK lines expected sqxtn on each operand followed by a
; shadd on .8b lanes.
5963define <8 x i8 > @test_avgfloor_s (<8 x i16 > %a , <8 x i16 > %b ) {
6064; CHECK-LABEL: test_avgfloor_s:
6165; CHECK: // %bb.0:
62- ; CHECK-NEXT: sqxtn v0.8b, v0.8h
63- ; CHECK-NEXT: sqxtn v1.8b, v1.8h
64- ; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
66+ ; CHECK-NEXT: movi v2.8h, #127
67+ ; CHECK-NEXT: mvni v3.8h, #127
68+ ; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
69+ ; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
70+ ; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
71+ ; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
72+ ; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
73+ ; CHECK-NEXT: xtn v0.8b, v0.8h
6574; CHECK-NEXT: ret
6675 %min = insertelement <8 x i16 > poison, i16 -128 , i32 0
6776 %min.splat = shufflevector <8 x i16 > %min , <8 x i16 > poison, <8 x i32 > zeroinitializer
@@ -71,9 +80,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
7180 %ta16.clamped = call <8 x i16 > @llvm.smax.v8i16 (<8 x i16 > %ta16 , <8 x i16 > %min.splat )
7281 %tb16 = call <8 x i16 > @llvm.smin.v8i16 (<8 x i16 > %b , <8 x i16 > %max.splat )
7382 %tb16.clamped = call <8 x i16 > @llvm.smax.v8i16 (<8 x i16 > %tb16 , <8 x i16 > %min.splat )
74- %ta8 = trunc <8 x i16 > %ta16.clamped to <8 x i8 >
75- %tb8 = trunc <8 x i16 > %tb16.clamped to <8 x i8 >
76- %res = call <8 x i8 > @llvm.aarch64.neon.shadd.v8i8 (<8 x i8 > %ta8 , <8 x i8 > %tb8 )
83+ %avg16 = call <8 x i16 > @llvm.aarch64.neon.shadd.v8i16 (<8 x i16 > %ta16.clamped , <8 x i16 > %tb16.clamped )
84+ %res = trunc <8 x i16 > %avg16 to <8 x i8 >
7785 ret <8 x i8 > %res
7886}
7987
0 commit comments