 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE
-; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1-SME2 -check-prefix CHECK-SVE2p1
+; RUN: llc -mattr=+sve -mattr=+sme2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-SVE2p1-SME2 -check-prefix CHECK-SME2
 target triple = "aarch64-linux"
 
 ; Test combining of getActiveLaneMask with a pair of extract_vector operations.
@@ -13,12 +14,12 @@ define void @test_2x8bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #0
 ; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: mov w8, w1
-; CHECK-SVE2p1-NEXT: mov w9, w0
-; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x9, x8
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: mov w8, w1
+; CHECK-SVE2p1-SME2-NEXT: mov w9, w0
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x9, x8
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %i, i32 %n)
   %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
   %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
@@ -34,10 +35,10 @@ define void @test_2x8bit_mask_with_64bit_index_and_trip_count(i64 %i, i64 %n) #0
 ; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x0, x1
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %i, i64 %n)
   %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
   %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
@@ -53,12 +54,12 @@ define void @test_edge_case_2x1bit_mask(i64 %i, i64 %n) #0 {
 ; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_edge_case_2x1bit_mask:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: whilelo p1.d, x0, x1
-; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE2p1-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: whilelo p1.d, x0, x1
+; CHECK-SVE2p1-SME2-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE2p1-SME2-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %i, i64 %n)
   %v0 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 0)
   %v1 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 1)
@@ -74,10 +75,10 @@ define void @test_boring_case_2x2bit_mask(i64 %i, i64 %n) #0 {
 ; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_boring_case_2x2bit_mask:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x0, x1
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
   %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0)
   %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 2)
@@ -96,14 +97,14 @@ define void @test_partial_extract(i64 %i, i64 %n) #0 {
 ; CHECK-SVE-NEXT: punpklo p1.h, p2.b
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_partial_extract:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: whilelo p0.h, x0, x1
-; CHECK-SVE2p1-NEXT: punpklo p1.h, p0.b
-; CHECK-SVE2p1-NEXT: punpkhi p2.h, p0.b
-; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE2p1-NEXT: punpklo p1.h, p2.b
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_partial_extract:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: whilelo p0.h, x0, x1
+; CHECK-SVE2p1-SME2-NEXT: punpklo p1.h, p0.b
+; CHECK-SVE2p1-SME2-NEXT: punpkhi p2.h, p0.b
+; CHECK-SVE2p1-SME2-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE2p1-SME2-NEXT: punpklo p1.h, p2.b
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n)
   %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 0)
   %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 4)
@@ -144,6 +145,21 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 {
 ; CHECK-SVE2p1-NEXT: mov v1.s[1], w11
 ; CHECK-SVE2p1-NEXT: // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SVE2p1-NEXT: b use
+;
+; CHECK-SME2-LABEL: test_fixed_extract:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: whilelo p0.h, x0, x1
+; CHECK-SME2-NEXT: cset w8, mi
+; CHECK-SME2-NEXT: mov z0.h, p0/z, #1 // =0x1
+; CHECK-SME2-NEXT: mov z1.h, z0.h[1]
+; CHECK-SME2-NEXT: mov z2.h, z0.h[5]
+; CHECK-SME2-NEXT: mov z3.h, z0.h[4]
+; CHECK-SME2-NEXT: fmov s0, w8
+; CHECK-SME2-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-SME2-NEXT: zip1 z1.s, z3.s, z2.s
+; CHECK-SME2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-SME2-NEXT: // kill: def $d1 killed $d1 killed $z1
+; CHECK-SME2-NEXT: b use
   %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n)
   %v0 = call <2 x i1> @llvm.vector.extract.v2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 0)
   %v1 = call <2 x i1> @llvm.vector.extract.v2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 4)
@@ -163,12 +179,12 @@ define void @test_2x16bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #
 ; CHECK-SVE-NEXT: whilelo p1.b, w8, w1
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: mov w8, w1
-; CHECK-SVE2p1-NEXT: mov w9, w0
-; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, x9, x8
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: mov w8, w1
+; CHECK-SVE2p1-SME2-NEXT: mov w9, w0
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.b, p1.b }, x9, x8
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32 %i, i32 %n)
   %v0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 0)
   %v1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 16)
@@ -193,16 +209,16 @@ define void @test_2x32bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #
 ; CHECK-SVE-NEXT: whilelo p2.b, w8, w1
 ; CHECK-SVE-NEXT: b use
 ;
-; CHECK-SVE2p1-LABEL: test_2x32bit_mask_with_32bit_index_and_trip_count:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: rdvl x8, #2
-; CHECK-SVE2p1-NEXT: mov w9, w1
-; CHECK-SVE2p1-NEXT: mov w10, w0
-; CHECK-SVE2p1-NEXT: adds w8, w0, w8
-; CHECK-SVE2p1-NEXT: csinv w8, w8, wzr, lo
-; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, x10, x9
-; CHECK-SVE2p1-NEXT: whilelo { p2.b, p3.b }, x8, x9
-; CHECK-SVE2p1-NEXT: b use
+; CHECK-SVE2p1-SME2-LABEL: test_2x32bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE2p1-SME2: // %bb.0:
+; CHECK-SVE2p1-SME2-NEXT: rdvl x8, #2
+; CHECK-SVE2p1-SME2-NEXT: mov w9, w1
+; CHECK-SVE2p1-SME2-NEXT: mov w10, w0
+; CHECK-SVE2p1-SME2-NEXT: adds w8, w0, w8
+; CHECK-SVE2p1-SME2-NEXT: csinv w8, w8, wzr, lo
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.b, p1.b }, x10, x9
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.b, p3.b }, x8, x9
+; CHECK-SVE2p1-SME2-NEXT: b use
   %r = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %i, i32 %n)
   %v0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1.i64(<vscale x 64 x i1> %r, i64 0)
   %v1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1.i64(<vscale x 64 x i1> %r, i64 16)
@@ -212,93 +228,6 @@ define void @test_2x32bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #
   ret void
 }
 
-define void @test_2x16bit_mask_with_32bit_index_and_trip_count_ext8(i32 %i, i32 %n) #0 {
-; CHECK-SVE-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count_ext8:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE-NEXT: rdvl x8, #1
-; CHECK-SVE-NEXT: adds w8, w0, w8
-; CHECK-SVE-NEXT: csinv w8, w8, wzr, lo
-; CHECK-SVE-NEXT: whilelo p0.b, w0, w1
-; CHECK-SVE-NEXT: whilelo p4.b, w8, w1
-; CHECK-SVE-NEXT: punpklo p1.h, p0.b
-; CHECK-SVE-NEXT: punpkhi p3.h, p0.b
-; CHECK-SVE-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: punpklo p2.h, p3.b
-; CHECK-SVE-NEXT: punpkhi p3.h, p3.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: punpklo p1.h, p4.b
-; CHECK-SVE-NEXT: punpkhi p3.h, p4.b
-; CHECK-SVE-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: punpklo p2.h, p3.b
-; CHECK-SVE-NEXT: punpkhi p3.h, p3.b
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: b use
-;
-; CHECK-SVE2p1-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count_ext8:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE2p1-NEXT: mov w8, w1
-; CHECK-SVE2p1-NEXT: mov w9, w0
-; CHECK-SVE2p1-NEXT: whilelo { p4.b, p5.b }, x9, x8
-; CHECK-SVE2p1-NEXT: punpklo p1.h, p4.b
-; CHECK-SVE2p1-NEXT: punpkhi p3.h, p4.b
-; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE2p1-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE2p1-NEXT: punpklo p2.h, p3.b
-; CHECK-SVE2p1-NEXT: punpkhi p3.h, p3.b
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: punpklo p1.h, p5.b
-; CHECK-SVE2p1-NEXT: punpkhi p3.h, p5.b
-; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
-; CHECK-SVE2p1-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE2p1-NEXT: punpklo p2.h, p3.b
-; CHECK-SVE2p1-NEXT: punpkhi p3.h, p3.b
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: b use
-  %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32 %i, i32 %n)
-  %v0 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 0)
-  %v1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 4)
-  %v2 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 8)
-  %v3 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 12)
-  tail call void @use(<vscale x 4 x i1> %v0, <vscale x 4 x i1> %v1, <vscale x 4 x i1> %v2, <vscale x 4 x i1> %v3)
-  %v4 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 16)
-  %v5 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 20)
-  %v6 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 24)
-  %v7 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 28)
-  tail call void @use(<vscale x 4 x i1> %v4, <vscale x 4 x i1> %v5, <vscale x 4 x i1> %v6, <vscale x 4 x i1> %v7)
-  ret void
-}
-
-define void @test_2x16bit_mask_with_32bit_index_and_trip_count_part_extracts(i32 %i, i32 %n) #0 {
-; CHECK-SVE-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count_part_extracts:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: rdvl x8, #1
-; CHECK-SVE-NEXT: adds w8, w0, w8
-; CHECK-SVE-NEXT: csinv w8, w8, wzr, lo
-; CHECK-SVE-NEXT: whilelo p0.b, w0, w1
-; CHECK-SVE-NEXT: whilelo p1.b, w8, w1
-; CHECK-SVE-NEXT: punpkhi p0.h, p0.b
-; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: b use
-;
-; CHECK-SVE2p1-LABEL: test_2x16bit_mask_with_32bit_index_and_trip_count_part_extracts:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: mov w8, w1
-; CHECK-SVE2p1-NEXT: mov w9, w0
-; CHECK-SVE2p1-NEXT: whilelo { p2.b, p3.b }, x9, x8
-; CHECK-SVE2p1-NEXT: punpkhi p0.h, p2.b
-; CHECK-SVE2p1-NEXT: punpkhi p1.h, p3.b
-; CHECK-SVE2p1-NEXT: b use
-  %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32 %i, i32 %n)
-  %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 8)
-  %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 24)
-  tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
-  ret void
-}
-
 declare void @use(...)
 
 attributes #0 = { nounwind }