[AArch64][SVE] Allow factors other than 2/4 for load+deinterleave3+store patterns for codegen #162475
base: main
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Rajveer Singh Bharadwaj (Rajveer100)

Changes: Resolves #159801 and #162068

Full diff: https://github.com/llvm/llvm-project/pull/162475.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 29d65d5d1db64..a41e3f73fd5b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17973,7 +17973,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-  if (Factor != 2 && Factor != 4) {
-    LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
+    LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
}
@@ -18052,7 +18052,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
-  if (Factor != 2 && Factor != 4) {
-    LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
+    LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
new file mode 100644
index 0000000000000..0d41dc9113978
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
+
+define void @load_factor2(ptr %ptr, ptr %s1, ptr %s2) {
+; SVE-LABEL: load_factor2:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 8 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
+
+  %v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+
+  store <vscale x 4 x i32> %v0, ptr %s1
+  store <vscale x 4 x i32> %v1, ptr %s2
+ ret void
+}
+
+define void @load_factor3(ptr %ptr, ptr %s1, ptr %s2, ptr %s3) {
+; SVE-LABEL: load_factor3:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld3w { z0.s - z2.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: str z2, [x3]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
+
+  %v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %v2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+
+  store <vscale x 4 x i32> %v0, ptr %s1
+  store <vscale x 4 x i32> %v1, ptr %s2
+  store <vscale x 4 x i32> %v2, ptr %s3
+ ret void
+}
+
+define void @load_factor4(ptr %ptr, ptr %s1, ptr %s2, ptr %s3, ptr %s4) {
+; SVE-LABEL: load_factor4:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
+; SVE-NEXT: str z0, [x1]
+; SVE-NEXT: str z1, [x2]
+; SVE-NEXT: str z2, [x3]
+; SVE-NEXT: str z3, [x4]
+; SVE-NEXT: ret
+ %wide.vec = load <vscale x 16 x i32>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %wide.vec)
+
+  %v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %v2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+  %v3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 3
+
+  store <vscale x 4 x i32> %v0, ptr %s1
+  store <vscale x 4 x i32> %v1, ptr %s2
+  store <vscale x 4 x i32> %v2, ptr %s3
+  store <vscale x 4 x i32> %v3, ptr %s4
+ ret void
+}
+
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32>)
+
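Note that the new test only exercises the load/deinterleave side, even though the patch relaxes the factor check on the store path as well. A minimal sketch of a store-side counterpart is below; it is not part of the PR, and it assumes the llvm.vector.interleave3 intrinsic with the usual result-type mangling, with the expectation that it now selects st3w:

; Hypothetical store-side counterpart (not in the PR): with factor 3
; accepted by lowerInterleaveIntrinsicToStore, interleave3 + store
; should lower to an st3w structure store.
define void @store_factor3(ptr %ptr, <vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2) {
  %interleaved = tail call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2)
  store <vscale x 12 x i32> %interleaved, ptr %ptr, align 8
  ret void
}

declare <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)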
This doesn't actually fix the reported issues but rather hides them, due to how their test cases are written. That's not to say this PR isn't desirable (it is); I'd just rather keep the defects open until the underlying issues are resolved.
Yeah, this only handles these particular factors for now, rather than the general case.
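To illustrate what remains unhandled: any factor outside 2/3/4 still fails the check and takes the generic fallback, since SVE only provides ld2/ld3/ld4 structure loads. A hypothetical factor-5 sketch (assuming the llvm.vector.deinterleave5 intrinsic is available):

; Hypothetical factor-5 case: Factor != 2/3/4, so
; lowerDeinterleaveIntrinsicToLoad still bails out.
define <vscale x 4 x i32> @load_factor5(ptr %ptr) {
  %wide.vec = load <vscale x 20 x i32>, ptr %ptr, align 8
  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave5.nxv20i32(<vscale x 20 x i32> %wide.vec)
  %v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
  ret <vscale x 4 x i32> %v0
}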
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
While the code change is in AArch64ISelLowering, the affected function does not really relate to code generation: it is part of the InterleavedAccess pass, which is an IR pass. Its tests live in llvm/test/Transforms/InterleavedAccess/AArch64, so it would be better to create a 3-way variant of scalable-deinterleave-intrinsics.ll.
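For reference, a sketch of what such a 3-way variant might look like; the RUN line and the expected rewrite are assumptions modeled on the existing tests in that directory, not taken from this PR:

; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s

; The pass is expected to replace load + deinterleave3 with an SVE
; structured-load intrinsic (presumably @llvm.aarch64.sve.ld3.sret.nxv4i32;
; exact name assumed here, to be confirmed against the pass output).
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave3_nxv4i32(ptr %ptr) {
  %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN
}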