From 4b7fce219a057f0d11241940ba6d528ad6c72a9f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Thu, 12 Dec 2024 17:56:13 +0000
Subject: [PATCH 1/3] [AArch64][SVE] Don't require 16-byte aligned SVE loads/stores with +strict-align

Instead, allow any alignment >= the element size (in bytes). This is all
that is needed for vector loads even if unaligned accesses are disabled.

See: https://developer.arm.com/documentation/ddi0602/2024-09/Shared-Pseudocode/aarch64-functions-memory?lang=en#impl-aarch64.Mem.read.3

Specifically:

```
// Check alignment on size of element accessed, not overall access size.
constant integer alignment = if accdesc.ispair then size DIV 2 else size;
```

The `size` passed to `Mem` by SVE load/store instructions is the element
size.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 11 ++++
 .../AArch64/sve-load-store-strict-align.ll    | 58 +++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e61dedb247756..864e3cfcfeb56 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2569,6 +2569,17 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *Fast) const {
+
+  // Allow SVE loads/stores where the alignment >= the size of the element type,
+  // even with +strict-align. The SVE loads/stores do not require memory to be
+  // aligned more than the element type even without unaligned accesses.
+  // Without this, already aligned loads and stores are forced to have 16-byte
+  // alignment, which is unnecessary and fails to build as
+  // TLI.expandUnalignedLoad() and TLI.expandUnalignedStore() don't yet support
+  // scalable vectors.
+  if (VT.isScalableVector() && Alignment >= Align(VT.getScalarSizeInBits() / 8))
+    return true;
+
   if (Subtarget->requiresStrictAlign())
     return false;

diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll
new file mode 100644
index 0000000000000..94e120b829424
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s | FileCheck %s
+
+define void @nxv16i8(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+  store <vscale x 16 x i8> %l3, ptr %stptr, align 1
+  ret void
+}
+
+define void @nxv8i16(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 8 x i16>, ptr %ldptr, align 2
+  store <vscale x 8 x i16> %l3, ptr %stptr, align 2
+  ret void
+}
+
+define void @nxv4i32(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 4 x i32>, ptr %ldptr, align 4
+  store <vscale x 4 x i32> %l3, ptr %stptr, align 4
+  ret void
+}
+
+define void @nxv2i64(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 8
+  store <vscale x 2 x i64> %l3, ptr %stptr, align 8
+  ret void
+}
+
+; FIXME: Support TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE.
+; define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) {
+;   %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 4
+;   store <vscale x 2 x i64> %l3, ptr %stptr, align 4
+;   ret void
+; }

From 5065257c8f87602b70ecd3a70c848411822efe5e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Fri, 13 Dec 2024 13:09:55 +0000
Subject: [PATCH 2/3] Fixups

---
 .../lib/Target/AArch64/AArch64ISelLowering.cpp | 18 ++++++++++--------
 .../AArch64/sve-load-store-strict-align.ll     | 16 ++++++++++------
 .../sve-unaligned-load-store-strict-align.ll   | 18 ++++++++++++++++++
 3 files changed, 38 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 864e3cfcfeb56..8ab5cb2902110 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2571,14 +2571,16 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
     unsigned *Fast) const {

   // Allow SVE loads/stores where the alignment >= the size of the element type,
-  // even with +strict-align. The SVE loads/stores do not require memory to be
-  // aligned more than the element type even without unaligned accesses.
-  // Without this, already aligned loads and stores are forced to have 16-byte
-  // alignment, which is unnecessary and fails to build as
-  // TLI.expandUnalignedLoad() and TLI.expandUnalignedStore() don't yet support
-  // scalable vectors.
-  if (VT.isScalableVector() && Alignment >= Align(VT.getScalarSizeInBits() / 8))
-    return true;
+  // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
+  // for stores that come from IR, only require element-size alignment (even if
+  // unaligned accesses are disabled). Without this, these will be forced to
+  // have 16-byte alignment with +strict-align (and fail to lower as we don't
+  // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
+  if (VT.isScalableVector()) {
+    unsigned ElementSizeBits = VT.getScalarSizeInBits();
+    if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
+      return true;
+  }

   if (Subtarget->requiresStrictAlign())
     return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll
index 94e120b829424..c5b0651ab01d4 100644
--- a/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll
@@ -50,9 +50,13 @@ define void @nxv2i64(ptr %ldptr, ptr %stptr) {
   ret void
 }

-; FIXME: Support TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE.
-; define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) {
-;   %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 4
-;   store <vscale x 2 x i64> %l3, ptr %stptr, align 4
-;   ret void
-; }
+define void @nxv16i1(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr p0, [x0]
+; CHECK-NEXT:    str p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 16 x i1>, ptr %ldptr, align 2
+  store <vscale x 16 x i1> %l3, ptr %stptr, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll
new file mode 100644
index 0000000000000..62893e6ce7f98
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s | FileCheck %s
+
+; FIXME: Support TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE.
+; XFAIL: *
+
+define void @unaligned_nxv16i1(ptr %ldptr, ptr %stptr) {
+  %l3 = load <vscale x 16 x i1>, ptr %ldptr, align 1
+  store <vscale x 16 x i1> %l3, ptr %stptr, align 1
+  ret void
+}
+
+define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) {
+  %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 4
+  store <vscale x 2 x i64> %l3, ptr %stptr, align 4
+  ret void
+}

From 890c93b5d900a27dfcb003658bb8995d9b268271 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Fri, 13 Dec 2024 15:04:11 +0000
Subject: [PATCH 3/3] Fixups

---
 .../sve-unaligned-load-store-strict-align.ll | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll
index 62893e6ce7f98..27637800f751f 100644
--- a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll
+++ b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll
@@ -1,17 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s | FileCheck %s
+; RUN: not --crash llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s 2>&1 | FileCheck %s --check-prefix=CHECK-FIXME
+
+; REQUIRES: asserts

 ; FIXME: Support TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE.
-; XFAIL: *
+; CHECK-FIXME: LLVM ERROR: Invalid size request on a scalable vector.

 define void @unaligned_nxv16i1(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: unaligned_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr p0, [x0]
+; CHECK-NEXT:    str p0, [x1]
+; CHECK-NEXT:    ret
   %l3 = load <vscale x 16 x i1>, ptr %ldptr, align 1
   store <vscale x 16 x i1> %l3, ptr %stptr, align 1
   ret void
 }

 define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: unaligned_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
   %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 4
   store <vscale x 2 x i64> %l3, ptr %stptr, align 4
   ret void