Skip to content

Commit ef3c36b

Browse files
committed
[AArch64][SVE] Allow factor 3 in addition to 2/4 for load+deinterleave+store
patterns in codegen. Resolves #159801 and #162068
1 parent 46ad540 commit ef3c36b

File tree

3 files changed

+186
-2
lines changed

3 files changed

+186
-2
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17973,7 +17973,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1797317973
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1797417974
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
1797517975
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
17976-
if (Factor != 2 && Factor != 4) {
17976+
if (Factor != 2 && Factor != 3 && Factor != 4) {
1797717977
LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
1797817978
return false;
1797917979
}
@@ -18052,7 +18052,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
1805218052
Instruction *Store, Value *Mask,
1805318053
ArrayRef<Value *> InterleavedValues) const {
1805418054
unsigned Factor = InterleavedValues.size();
18055-
if (Factor != 2 && Factor != 4) {
18055+
if (Factor != 2 && Factor != 3 && Factor != 4) {
1805618056
LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
1805718057
return false;
1805818058
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
3+
4+
define void @load_factor2(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2) {
5+
; SVE-LABEL: load_factor2:
6+
; SVE: // %bb.0:
7+
; SVE-NEXT: ptrue p0.s
8+
; SVE-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
9+
; SVE-NEXT: str z0, [x1]
10+
; SVE-NEXT: str z1, [x2]
11+
; SVE-NEXT: ret
12+
%wide.vec = load <vscale x 8 x i32>, ptr %ptr, align 8
13+
%ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
14+
15+
%3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
16+
%4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
17+
18+
store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
19+
store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
20+
ret void
21+
}
22+
23+
define void @load_factor3(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2, <vscale x 4 x i32>* %s3) {
24+
; SVE-LABEL: load_factor3:
25+
; SVE: // %bb.0:
26+
; SVE-NEXT: ptrue p0.s
27+
; SVE-NEXT: ld3w { z0.s - z2.s }, p0/z, [x0]
28+
; SVE-NEXT: str z0, [x1]
29+
; SVE-NEXT: str z1, [x2]
30+
; SVE-NEXT: str z2, [x3]
31+
; SVE-NEXT: ret
32+
%wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
33+
%ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
34+
35+
%3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
36+
%4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
37+
%5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
38+
39+
store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
40+
store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
41+
store <vscale x 4 x i32> %5, <vscale x 4 x i32>* %s3
42+
ret void
43+
}
44+
45+
define void @load_factor4(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2, <vscale x 4 x i32>* %s3, <vscale x 4 x i32>* %s4) {
46+
; SVE-LABEL: load_factor4:
47+
; SVE: // %bb.0:
48+
; SVE-NEXT: ptrue p0.s
49+
; SVE-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
50+
; SVE-NEXT: str z0, [x1]
51+
; SVE-NEXT: str z1, [x2]
52+
; SVE-NEXT: str z2, [x3]
53+
; SVE-NEXT: str z3, [x4]
54+
; SVE-NEXT: ret
55+
%wide.vec = load <vscale x 16 x i32>, ptr %ptr, align 8
56+
%ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %wide.vec)
57+
58+
%3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
59+
%4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
60+
%5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
61+
%6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 3
62+
63+
store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
64+
store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
65+
store <vscale x 4 x i32> %5, <vscale x 4 x i32>* %s3
66+
store <vscale x 4 x i32> %6, <vscale x 4 x i32>* %s4
67+
ret void
68+
}
69+
70+
71+
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
72+
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)
73+
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32>)
74+

llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,110 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
289289
ret void
290290
}
291291

292+
define void @deinterleave_nxi64_factor3(i32* %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) {
293+
; CHECK-LABEL: define void @deinterleave_nxi64_factor3
294+
; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
295+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8
296+
; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]])
297+
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0
298+
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1
299+
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2
300+
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8
301+
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8
302+
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8
303+
; CHECK-NEXT: ret void
304+
;
305+
%wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8
306+
%ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec)
307+
308+
%3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0
309+
%4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1
310+
%5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2
311+
312+
store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1
313+
store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2
314+
store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3
315+
ret void
316+
}
317+
318+
define void @deinterleave_nxi32_factor3(i32* %ptr, <vscale x 2 x i32>* %s1, <vscale x 2 x i32>* %s2, <vscale x 2 x i32>* %s3) {
319+
; CHECK-LABEL: define void @deinterleave_nxi32_factor3
320+
; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
321+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 6 x i32>, ptr [[PTR]], align 8
322+
; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> [[WIDE_VEC]])
323+
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 0
324+
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 1
325+
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 2
326+
; CHECK-NEXT: store <vscale x 2 x i32> [[TMP1]], ptr [[S1]], align 8
327+
; CHECK-NEXT: store <vscale x 2 x i32> [[TMP2]], ptr [[S2]], align 8
328+
; CHECK-NEXT: store <vscale x 2 x i32> [[TMP3]], ptr [[S3]], align 8
329+
; CHECK-NEXT: ret void
330+
;
331+
%wide.vec = load <vscale x 6 x i32>, ptr %ptr, align 8
332+
%ldN = tail call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> %wide.vec)
333+
334+
%3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 0
335+
%4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 1
336+
%5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 2
337+
338+
store <vscale x 2 x i32> %3, <vscale x 2 x i32>* %s1
339+
store <vscale x 2 x i32> %4, <vscale x 2 x i32>* %s2
340+
store <vscale x 2 x i32> %5, <vscale x 2 x i32>* %s3
341+
ret void
342+
}
343+
344+
define void @deinterleave_nxi16_factor3(i32* %ptr, <vscale x 4 x i16>* %s1, <vscale x 4 x i16>* %s2, <vscale x 4 x i16>* %s3) {
345+
; CHECK-LABEL: define void @deinterleave_nxi16_factor3
346+
; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
347+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 12 x i16>, ptr [[PTR]], align 8
348+
; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16> [[WIDE_VEC]])
349+
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 0
350+
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 1
351+
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 2
352+
; CHECK-NEXT: store <vscale x 4 x i16> [[TMP1]], ptr [[S1]], align 8
353+
; CHECK-NEXT: store <vscale x 4 x i16> [[TMP2]], ptr [[S2]], align 8
354+
; CHECK-NEXT: store <vscale x 4 x i16> [[TMP3]], ptr [[S3]], align 8
355+
; CHECK-NEXT: ret void
356+
;
357+
%wide.vec = load <vscale x 12 x i16>, ptr %ptr, align 8
358+
%ldN = tail call { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16> %wide.vec)
359+
360+
%3 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 0
361+
%4 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 1
362+
%5 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 2
363+
364+
store <vscale x 4 x i16> %3, <vscale x 4 x i16>* %s1
365+
store <vscale x 4 x i16> %4, <vscale x 4 x i16>* %s2
366+
store <vscale x 4 x i16> %5, <vscale x 4 x i16>* %s3
367+
ret void
368+
}
369+
370+
define void @deinterleave_nxi8_factor3(i32* %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) {
371+
; CHECK-LABEL: define void @deinterleave_nxi8_factor3
372+
; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
373+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8
374+
; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]])
375+
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0
376+
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1
377+
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2
378+
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8
379+
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8
380+
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8
381+
; CHECK-NEXT: ret void
382+
;
383+
%wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8
384+
%ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> %wide.vec)
385+
386+
%3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0
387+
%4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1
388+
%5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2
389+
390+
store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1
391+
store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2
392+
store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3
393+
ret void
394+
}
395+
292396
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
293397
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
294398
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
@@ -312,4 +416,10 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <
312416
; Larger interleaves to test 'legalization'
313417
declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
314418

419+
; Deinterleaves with Factor=3
420+
declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>)
421+
declare { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32>)
422+
declare { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16>)
423+
declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>)
424+
315425
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

0 commit comments

Comments
 (0)