|
| 1 | +From 04146b444eb70c449a958b246ae4f4e9b52bf4bc Mon Sep 17 00:00:00 2001 |
| 2 | +From: Keith Packard <[email protected]>
| 3 | +Date: Fri, 4 Oct 2024 21:08:17 -0700 |
| 4 | +Subject: [compiler-rt] Support aarch64 targets without FPU |
| 5 | + |
| 6 | +Fall back to the old C implementations of various routines when |
| 7 | +the target doesn't have an FPU. |
| 8 | + |
| 9 | +Signed-off-by: Keith Packard <[email protected]>
| 10 | +--- |
| 11 | + .../builtins/aarch64/sme-libc-mem-routines.S | 2 +- |
| 12 | + .../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++ |
| 13 | + 2 files changed, 78 insertions(+), 1 deletion(-) |
| 14 | + |
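For context: __ARM_FP is the ACLE feature macro describing hardware floating-point support. Compilers define it as a non-zero bitmask of the supported FP data sizes when an FPU is present and leave it undefined otherwise, and an undefined identifier evaluates to 0 inside an #if expression. A minimal sketch of the guard pattern the patch relies on (illustrative only, not part of the patch):

    /* With an FPU, __ARM_FP is a non-zero bitmask (e.g. 0x0E for
     * half/single/double precision); without one it is undefined and
     * therefore evaluates to 0 in the #if below. */
    #if defined(__aarch64__) && __ARM_FP != 0
    /* FP/SIMD registers are available: keep the optimized assembly routines. */
    #else
    /* No FPU: fall back to the scalar C implementations added below. */
    #endif
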
| 15 | +diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S |
| 16 | +index 0318d9a6f1..72d87fb4fa 100644 |
| 17 | +--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S |
| 18 | ++++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S |
| 19 | +@@ -6,7 +6,7 @@ |
| 20 | + |
| 21 | + #include "../assembly.h" |
| 22 | + |
| 23 | +-#ifdef __aarch64__ |
| 24 | ++#if defined(__aarch64__) && __ARM_FP != 0 |
| 25 | + |
| 26 | + #define L(l) .L ## l |
| 27 | + |
| 28 | +diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c |
| 29 | +index 315490e73e..92fb953c03 100644 |
| 30 | +--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c |
| 31 | ++++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c |
| 32 | +@@ -1,5 +1,82 @@ |
| 33 | + #include <stddef.h> |
| 34 | + |
| 35 | ++#if __ARM_FP == 0 |
| 36 | ++// WARNING: When building the scalar versions of these functions you need to |
| 37 | ++// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang |
| 38 | ++// from recognising a loop idiom and planting calls to memcpy! |
| 39 | ++ |
| 40 | ++static void *__arm_sc_memcpy_fwd(void *dest, const void *src, |
| 41 | ++ size_t n) __arm_streaming_compatible { |
| 42 | ++ unsigned char *destp = (unsigned char *)dest; |
| 43 | ++ const unsigned char *srcp = (const unsigned char *)src; |
| 44 | ++ for (size_t i = 0; i < n; ++i) |
| 45 | ++ destp[i] = srcp[i]; |
| 46 | ++ |
| 47 | ++ return dest; |
| 48 | ++} |
| 49 | ++ |
| 50 | ++// If dest and src overlap then behaviour is undefined, hence we can add the |
| 51 | ++// restrict keywords here. This also matches the definition of the libc memcpy |
| 52 | ++// according to the man page. |
| 53 | ++void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, |
| 54 | ++ size_t n) __arm_streaming_compatible { |
| 55 | ++ return __arm_sc_memcpy_fwd(dest, src, n); |
| 56 | ++} |
| 57 | ++ |
| 58 | ++void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { |
| 59 | ++ unsigned char *destp = (unsigned char *)dest; |
| 60 | ++ unsigned char c8 = (unsigned char)c; |
| 61 | ++ for (size_t i = 0; i < n; ++i) |
| 62 | ++ destp[i] = c8; |
| 63 | ++ |
| 64 | ++ return dest; |
| 65 | ++} |
| 66 | ++ |
| 67 | ++static void *__arm_sc_memcpy_rev(void *dest, const void *src, |
| 68 | ++ size_t n) __arm_streaming_compatible { |
| 69 | ++ unsigned char *destp = (unsigned char *)dest; |
| 70 | ++ const unsigned char *srcp = (const unsigned char *)src; |
| 71 | ++ // TODO: Improve performance by copying larger chunks in reverse, or by |
| 72 | ++ // using SVE. |
| 73 | ++ while (n > 0) { |
| 74 | ++ --n; |
| 75 | ++ destp[n] = srcp[n]; |
| 76 | ++ } |
| 77 | ++ return dest; |
| 78 | ++} |
| 79 | ++ |
| 80 | ++// Semantically a memmove is equivalent to the following: |
| 81 | ++// 1. Copy the entire contents of src to a temporary array that does not |
| 82 | ++// overlap with src or dest. |
| 83 | ++// 2. Copy the contents of the temporary array into dest. |
| 84 | ++void *__arm_sc_memmove(void *dest, const void *src, |
| 85 | ++ size_t n) __arm_streaming_compatible { |
| 86 | ++ unsigned char *destp = (unsigned char *)dest; |
| 87 | ++ const unsigned char *srcp = (const unsigned char *)src; |
| 88 | ++ |
| 89 | ++ // If src and dest don't overlap then just invoke memcpy |
| 90 | ++ if ((srcp > (destp + n)) || (destp > (srcp + n))) |
| 91 | ++ return __arm_sc_memcpy_fwd(dest, src, n); |
| 92 | ++ |
| 93 | ++ // Overlap case 1: |
| 94 | ++ // src: Low | -> | High |
| 95 | ++ // dest: Low | -> | High |
| 96 | ++// Here src is always ahead of dest at a higher address. If we first read a
| 97 | ++ // chunk of data from src we can safely write the same chunk to dest without |
| 98 | ++ // corrupting future reads of src. |
| 99 | ++ if (srcp > destp) |
| 100 | ++ return __arm_sc_memcpy_fwd(dest, src, n); |
| 101 | ++ |
| 102 | ++ // Overlap case 2: |
| 103 | ++ // src: Low | -> | High |
| 104 | ++ // dest: Low | -> | High |
| 105 | ++ // While we're in the overlap region we're always corrupting future reads of |
| 106 | ++// src when writing to dest. An efficient way to handle this is to copy the data
| 107 | ++ // in reverse by starting at the highest address. |
| 108 | ++ return __arm_sc_memcpy_rev(dest, src, n); |
| 109 | ++} |
| 110 | ++#endif |
| 111 | ++ |
| 112 | + const void *__arm_sc_memchr(const void *src, int c, |
| 113 | + size_t n) __arm_streaming_compatible { |
| 114 | + const unsigned char *srcp = (const unsigned char *)src; |
| 115 | +-- |
| 116 | +2.45.2 |
| 117 | + |
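As a rough sanity check for the new scalar fallbacks, the overlap handling described in the __arm_sc_memmove comments above can be compared against the libc memmove on a host build. The sketch below is illustrative only: it assumes the fallback routines are compiled into the program under test, and it declares __arm_sc_memmove with a plain hypothetical prototype (the real declaration in compiler-rt carries the __arm_streaming_compatible attribute).

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical plain prototype for the fallback under test. */
    void *__arm_sc_memmove(void *dest, const void *src, size_t n);

    int main(void) {
      char a[16], b[16];

      /* Overlap case 1: dest is below src, so a forward copy is safe. */
      for (int i = 0; i < 16; ++i)
        a[i] = b[i] = (char)i;
      __arm_sc_memmove(a, a + 4, 8);
      memmove(b, b + 4, 8);
      assert(memcmp(a, b, sizeof a) == 0);

      /* Overlap case 2: dest is above src, so the copy must run in reverse. */
      for (int i = 0; i < 16; ++i)
        a[i] = b[i] = (char)i;
      __arm_sc_memmove(a + 4, a, 8);
      memmove(b + 4, b, 8);
      assert(memcmp(a, b, sizeof a) == 0);

      puts("ok");
      return 0;
    }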