Skip to content

Commit 2d268fc

Browse files
[DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-8
1 parent 44609a3 commit 2d268fc

File tree

1 file changed

+27
-54
lines changed

1 file changed

+27
-54
lines changed
Lines changed: 27 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1,80 +1,53 @@
1 1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 2
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
3 3

4-
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
5-
; CHECK-LABEL: test_avgceil_u:
4+
define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
5+
; CHECK-LABEL: avgceil_u_i8_to_i16:
6 6
; CHECK: // %bb.0:
7-
; CHECK-NEXT: bic v0.8h, #255, lsl #8
8-
; CHECK-NEXT: bic v1.8h, #255, lsl #8
9-
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
10-
; CHECK-NEXT: xtn v0.8b, v0.8h
7+
; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
11 8
; CHECK-NEXT: ret
12-
%ta16 = and <8 x i16> %a, splat (i16 255)
13-
%tb16 = and <8 x i16> %b, splat (i16 255)
14-
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
15-
%res = trunc <8 x i16> %avg16 to <8 x i8>
16-
ret <8 x i8> %res
9+
%a16 = zext <8 x i8> %a to <8 x i16>
10+
%b16 = zext <8 x i8> %b to <8 x i16>
11+
%avg16 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
12+
%r = trunc <8 x i16> %avg16 to <8 x i8>
13+
ret <8 x i8> %r
17 14
}
18 15

19-
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
16+
17+
define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
20 18
; CHECK-LABEL: test_avgceil_s:
21 19
; CHECK: // %bb.0:
22-
; CHECK-NEXT: movi v2.8h, #127
23-
; CHECK-NEXT: mvni v3.8h, #127
24-
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
25-
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
26-
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
27-
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
28-
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
29-
; CHECK-NEXT: xtn v0.8b, v0.8h
20+
; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
30 21
; CHECK-NEXT: ret
31-
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
32-
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
33-
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
34-
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
35-
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
36-
%res = trunc <8 x i16> %avg16 to <8 x i8>
22+
%a16 = sext <8 x i8> %a to <8 x i16>
23+
%b16 = sext <8 x i8> %b to <8 x i16>
24+
%avg16 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
25+
%res = trunc <8 x i16> %avg16 to <8 x i8>
37 26
ret <8 x i8> %res
38 27
}
39 28

40-
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
41-
; CHECK-LABEL: test_avgfloor_u:
29+
define <8 x i8> @avgfloor_u_from_intrin(<8 x i8> %a, <8 x i8> %b) {
30+
; CHECK-LABEL: avgfloor_u_from_intrin:
42 31
; CHECK: // %bb.0:
43-
; CHECK-NEXT: bic v0.8h, #255, lsl #8
44-
; CHECK-NEXT: bic v1.8h, #255, lsl #8
45-
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
46-
; CHECK-NEXT: xtn v0.8b, v0.8h
32+
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
47 33
; CHECK-NEXT: ret
48-
%ta16 = and <8 x i16> %a, splat (i16 255)
49-
%tb16 = and <8 x i16> %b, splat (i16 255)
50-
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
34+
%a16 = zext <8 x i8> %a to <8 x i16>
35+
%b16 = zext <8 x i8> %b to <8 x i16>
36+
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
51 37
%res = trunc <8 x i16> %avg16 to <8 x i8>
52 38
ret <8 x i8> %res
53 39
}
54 40

55-
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
41+
define <8 x i8> @test_avgfloor_s(<8 x i8> %a, <8 x i8> %b) {
56 42
; CHECK-LABEL: test_avgfloor_s:
57 43
; CHECK: // %bb.0:
58-
; CHECK-NEXT: movi v2.8h, #127
59-
; CHECK-NEXT: mvni v3.8h, #127
60-
; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
61-
; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
62-
; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
63-
; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
64-
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
65-
; CHECK-NEXT: xtn v0.8b, v0.8h
44+
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
66 45
; CHECK-NEXT: ret
67-
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
68-
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
69-
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
70-
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
71-
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
72-
%res = trunc <8 x i16> %avg16 to <8 x i8>
46+
%a16 = sext <8 x i8> %a to <8 x i16>
47+
%b16 = sext <8 x i8> %b to <8 x i16>
48+
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
49+
%res = trunc <8 x i16> %avg16 to <8 x i8>
73 50
ret <8 x i8> %res
74 51
}
75 52

76-
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
77-
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
78-
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
79-
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
80 53

0 commit comments

Comments (0)