Commit a6f4448 — "[AArch64][ARM] Add new tests for tbl/tbx optimizations"
(1 parent: c52757d). 4 files changed: +490 −100 lines.
The file shown below is newly added in this commit (272 additions, 0 deletions).
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
3+
4+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
5+
target triple = "aarch64"
6+
7+
; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask
8+
; is constant and references 2 or fewer operands.
9+
10+
; Basic tbl1 with all in-bounds indices should optimize to shufflevector.
11+
define <16 x i8> @tbl1_basic(<16 x i8> %a) {
12+
; CHECK-LABEL: @tbl1_basic(
13+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
14+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
15+
;
16+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
17+
ret <16 x i8> %tbl
18+
}
19+
20+
; tbl2 with both operands the same should optimize (1 unique source).
21+
define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
22+
; CHECK-LABEL: @tbl2_duplicate_operands(
23+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
24+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
25+
;
26+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
27+
ret <16 x i8> %tbl
28+
}
29+
30+
; tbl4 with alternating duplicate operands should optimize (2 unique sources).
31+
define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
32+
; CHECK-LABEL: @tbl4_duplicate_operands(
33+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
34+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
35+
;
36+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
37+
ret <16 x i8> %tbl
38+
}
39+
40+
; tbl4 where mask only references first two operands should optimize.
41+
define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
42+
; CHECK-LABEL: @tbl4_unused_operands(
43+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
44+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
45+
;
46+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
47+
ret <16 x i8> %tbl
48+
}
49+
50+
; tbl4 where mask only references one operand should optimize.
51+
define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
52+
; CHECK-LABEL: @tbl4_single_operand_used(
53+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
54+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
55+
;
56+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
57+
ret <16 x i8> %tbl
58+
}
59+
60+
; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
61+
define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
62+
; CHECK-LABEL: @tbl1_with_oob(
63+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
64+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
65+
;
66+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
67+
ret <16 x i8> %tbl
68+
}
69+
70+
; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
71+
define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) {
72+
; CHECK-LABEL: @tbl2_duplicate_with_oob(
73+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
74+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
75+
;
76+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
77+
ret <16 x i8> %tbl
78+
}
79+
80+
; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
81+
define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) {
82+
; CHECK-LABEL: @tbl2_with_oob_bail(
83+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
84+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
85+
;
86+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
87+
ret <16 x i8> %tbl
88+
}
89+
90+
; tbl1 with all OOB indices should optimize to zero vector.
91+
define <16 x i8> @tbl1_all_oob(<16 x i8> %a) {
92+
; CHECK-LABEL: @tbl1_all_oob(
93+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
94+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
95+
;
96+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
97+
ret <16 x i8> %tbl
98+
}
99+
100+
; tbl3 referencing all 3 operands should NOT optimize.
101+
define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
102+
; CHECK-LABEL: @tbl3_three_sources_bail(
103+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
104+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
105+
;
106+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
107+
ret <16 x i8> %tbl
108+
}
109+
110+
; tbl4 referencing 3 unique operands should NOT optimize.
111+
define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
112+
; CHECK-LABEL: @tbl4_three_sources_bail(
113+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
114+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
115+
;
116+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
117+
ret <16 x i8> %tbl
118+
}
119+
120+
; tbl4 referencing all 4 unique operands should NOT optimize.
121+
define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
122+
; CHECK-LABEL: @tbl4_four_sources_bail(
123+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
124+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
125+
;
126+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
127+
ret <16 x i8> %tbl
128+
}
129+
130+
; tbx1 with no OOB should optimize.
131+
define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
132+
; CHECK-LABEL: @tbx1_no_oob(
133+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
134+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
135+
;
136+
%tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
137+
ret <16 x i8> %tbx
138+
}
139+
140+
; tbx2 where fallback == second source operand should optimize (deduplicated).
141+
define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) {
142+
; CHECK-LABEL: @tbx2_fallback_equals_second_source(
143+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
144+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
145+
;
146+
%tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
147+
ret <16 x i8> %tbx
148+
}
149+
150+
; tbx1 with OOB where fallback == source should optimize (deduplicated).
151+
define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) {
152+
; CHECK-LABEL: @tbx1_oob_fallback_same_as_source(
153+
; CHECK-NEXT: [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
154+
; CHECK-NEXT: ret <16 x i8> [[A]]
155+
;
156+
%tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
157+
ret <16 x i8> %tbx
158+
}
159+
160+
; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
161+
define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) {
162+
; CHECK-LABEL: @tbx2_with_oob_bail(
163+
; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
164+
; CHECK-NEXT: ret <16 x i8> [[TBX]]
165+
;
166+
%tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
167+
ret <16 x i8> %tbx
168+
}
169+
170+
; tbx1 with all OOB indices should optimize to fallback.
171+
define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) {
172+
; CHECK-LABEL: @tbx1_all_oob(
173+
; CHECK-NEXT: [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
174+
; CHECK-NEXT: ret <16 x i8> [[FALLBACK]]
175+
;
176+
%tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
177+
ret <16 x i8> %tbx
178+
}
179+
180+
; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize.
181+
define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) {
182+
; CHECK-LABEL: @tbx1_fallback_size_mismatch(
183+
; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
184+
; CHECK-NEXT: ret <8 x i8> [[TBX]]
185+
;
186+
%tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
187+
ret <8 x i8> %tbx
188+
}
189+
190+
; tbx1 with no OOB and mismatched fallback/source sizes should optimize.
191+
define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) {
192+
; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob(
193+
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
194+
; CHECK-NEXT: ret <8 x i8> [[TMP1]]
195+
;
196+
%tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
197+
ret <8 x i8> %tbx
198+
}
199+
200+
; tbl1 with non-i8 element type should NOT optimize.
201+
define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
202+
; CHECK-LABEL: @tbl1_8x16(
203+
; CHECK-NEXT: entry:
204+
; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
205+
; CHECK-NEXT: ret <8 x i16> [[TBL1]]
206+
;
207+
entry:
208+
%tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
209+
ret <8 x i16> %tbl1
210+
}
211+
declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
212+
213+
; tbl1 with non-8/16 element count should NOT optimize.
214+
define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
215+
; CHECK-LABEL: @tbl1_16x8(
216+
; CHECK-NEXT: entry:
217+
; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
218+
; CHECK-NEXT: ret <12 x i8> [[TBL1]]
219+
;
220+
entry:
221+
%tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
222+
ret <12 x i8> %tbl1
223+
}
224+
declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>)
225+
226+
; Non-constant mask should NOT optimize.
227+
define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {
228+
; CHECK-LABEL: @tbl1_non_constant_mask(
229+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]])
230+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
231+
;
232+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask)
233+
ret <16 x i8> %tbl
234+
}
235+
236+
; Mask with some poison elements should optimize, with poison propagating to output.
237+
define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
238+
; CHECK-LABEL: @tbl1_poison_mask_elements(
239+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
240+
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
241+
;
242+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
243+
ret <16 x i8> %tbl
244+
}
245+
246+
; Mask with all poison elements should optimize to poison.
247+
define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
248+
; CHECK-LABEL: @tbl1_all_poison_mask(
249+
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison)
250+
; CHECK-NEXT: ret <16 x i8> [[TBL]]
251+
;
252+
%tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
253+
ret <16 x i8> %tbl
254+
}
255+
256+
; "Real" declarations
257+
declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
258+
declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
259+
declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
260+
declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
261+
declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
262+
declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
263+
declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
264+
declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
265+
declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
266+
declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
267+
declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
268+
declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
269+
declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
270+
declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
271+
declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
272+
declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

0 commit comments.