From 7f56e0b4ff283abedccc4257d66694432a51d49a Mon Sep 17 00:00:00 2001
From: Rajveer
Date: Wed, 8 Oct 2025 18:00:53 +0530
Subject: [PATCH] [AArch64][SVE] Allow factor 3 in addition to 2 and 4 for
 load+deinterleave and store+interleave patterns in codegen

Resolves #159801 and #162068
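
For illustration, IR of the shape the new load_factor3 test exercises
(a minimal sketch; the value names are illustrative):

  %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)

With factor 3 accepted, this pair can be selected to a single predicated
ld3w; previously lowerDeinterleaveIntrinsicToLoad returned false for
factor 3 and the generic lowering was used instead. The store path
(st3w) is relaxed in the same way.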
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  4 +-
 .../AArch64/sve-vector-load+deinterleave.ll   | 74 +++++++++++++++++++
 2 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 29d65d5d1db64..a41e3f73fd5b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17973,7 +17973,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-  if (Factor != 2 && Factor != 4) {
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }
@@ -18052,7 +18052,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     Instruction *Store, Value *Mask,
     ArrayRef<Value *> InterleavedValues) const {
   unsigned Factor = InterleavedValues.size();
-  if (Factor != 2 && Factor != 4) {
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
new file mode 100644
index 0000000000000..0d41dc9113978
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
+
+define void @load_factor2(ptr %ptr, ptr %s1, ptr %s2) {
+; SVE-LABEL: load_factor2:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 8 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
+
+  %1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+
+  store <vscale x 4 x i32> %1, ptr %s1
+  store <vscale x 4 x i32> %2, ptr %s2
+  ret void
+}
+
+define void @load_factor3(ptr %ptr, ptr %s1, ptr %s2, ptr %s3) {
+; SVE-LABEL: load_factor3:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    str z2, [x3]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
+
+  %1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+
+  store <vscale x 4 x i32> %1, ptr %s1
+  store <vscale x 4 x i32> %2, ptr %s2
+  store <vscale x 4 x i32> %3, ptr %s3
+  ret void
+}
+
+define void @load_factor4(ptr %ptr, ptr %s1, ptr %s2, ptr %s3, ptr %s4) {
+; SVE-LABEL: load_factor4:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    str z2, [x3]
+; SVE-NEXT:    str z3, [x4]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 16 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %wide.vec)
+
+  %1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+  %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 3
+
+  store <vscale x 4 x i32> %1, ptr %s1
+  store <vscale x 4 x i32> %2, ptr %s2
+  store <vscale x 4 x i32> %3, ptr %s3
+  store <vscale x 4 x i32> %4, ptr %s4
+  ret void
+}
+
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32>)
+