-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64][SVE] Allow factors other than 2/4 for load+deinterleave3+store patterns for codegen #162475
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Rajveer Singh Bharadwaj (Rajveer100) Changes: Resolves #159801 and #162068. Full diff: https://github.com/llvm/llvm-project/pull/162475.diff — 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 29d65d5d1db64..a41e3f73fd5b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17973,7 +17973,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
- if (Factor != 2 && Factor != 4) {
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
@@ -18052,7 +18052,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
- if (Factor != 2 && Factor != 4) {
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
new file mode 100644
index 0000000000000..0d41dc9113978
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
+
+; Factor-2 case: a contiguous load feeding @llvm.vector.deinterleave2 must be
+; lowered to a single ld2w structured load. Uses opaque pointers ("ptr")
+; throughout — typed pointers (i32*) are deprecated and rejected by modern LLVM.
+define void @load_factor2(ptr %ptr, ptr %s1, ptr %s2) {
+; SVE-LABEL: load_factor2:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 8 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+
+ store <vscale x 4 x i32> %3, ptr %s1
+ store <vscale x 4 x i32> %4, ptr %s2
+ ret void
+}
+
+; Factor-3 case (the new pattern enabled by this patch): load + deinterleave3
+; must be lowered to a single ld3w structured load. Uses opaque pointers
+; ("ptr") — typed pointers (i32*) are deprecated and rejected by modern LLVM.
+define void @load_factor3(ptr %ptr, ptr %s1, ptr %s2, ptr %s3) {
+; SVE-LABEL: load_factor3:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld3w { z0.s - z2.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: str z2, [x3]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+
+ store <vscale x 4 x i32> %3, ptr %s1
+ store <vscale x 4 x i32> %4, ptr %s2
+ store <vscale x 4 x i32> %5, ptr %s3
+ ret void
+}
+
+; Factor-4 case: load + deinterleave4 must be lowered to a single ld4w
+; structured load. Uses opaque pointers ("ptr") throughout — typed pointers
+; (i32*) are deprecated and rejected by modern LLVM.
+define void @load_factor4(ptr %ptr, ptr %s1, ptr %s2, ptr %s3, ptr %s4) {
+; SVE-LABEL: load_factor4:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: str z2, [x3]
+; SVE-NEXT: str z3, [x4]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 16 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 3
+
+ store <vscale x 4 x i32> %3, ptr %s1
+ store <vscale x 4 x i32> %4, ptr %s2
+ store <vscale x 4 x i32> %5, ptr %s3
+ store <vscale x 4 x i32> %6, ptr %s4
+ ret void
+}
+
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32>)
+
|
|
This doesn't actually fix the reported issues but rather hides them due to how their test cases are written. That's not to say this PR is not desirable — it is; I'd just rather keep the defects open until the underlying issues are resolved. |
|
Yeah, this only handles those particular cases as of now, rather than the generic case. |
7f56e0b to
ef3c36b
Compare
|
@paulwalker-arm |
llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
Show resolved
Hide resolved
ef3c36b to
d377eb6
Compare
|
@paulwalker-arm |
llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
Outdated
Show resolved
Hide resolved
d377eb6 to
4d4d51b
Compare
|
Should be done now :) |
|
@paulwalker-arm |
|
@paulwalker-arm |
patterns for codegen. Resolves llvm#159801 and llvm#162068.
4d4d51b to
7f511dc
Compare
Resolves #159801 and #162068