diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ad958f..869266b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -291,6 +291,8 @@ set( ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0003-Disable-failing-compiler-rt-test.patch ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0004-libc-tests-with-picolibc-XFAIL-uses-of-atomics.patch ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0005-libc-tests-with-picolibc-mark-two-more-large-tests.patch + ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0006-libunwind-Support-aarch64-without-FPU.patch + ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0007-compiler-rt-Support-aarch64-targets-without-FPU.patch ) FetchContent_Declare(llvmproject GIT_REPOSITORY https://github.com/llvm/llvm-project.git @@ -1614,6 +1616,22 @@ add_library_variants_for_cpu( RAM_SIZE 0x1000000 STACK_SIZE 8K ) +add_library_variants_for_cpu( + aarch64a + SUFFIX soft_nofp + COMPILE_FLAGS "-march=armv8-a+nofp+nosimd -mabi=aapcs-soft" + MULTILIB_FLAGS "--target=aarch64-unknown-none-elf -march=armv8-a+nofp+nosimd -mabi=aapcs-soft" + PICOLIBC_BUILD_TYPE "release" + QEMU_MACHINE "virt" + QEMU_CPU "cortex-a57" + BOOT_FLASH_ADDRESS 0x40000000 + BOOT_FLASH_SIZE 0x1000 + FLASH_ADDRESS 0x40001000 + FLASH_SIZE 0xfff000 + RAM_ADDRESS 0x41000000 + RAM_SIZE 0x1000000 + STACK_SIZE 8K +) # For AArch32, clang uses different defaults for FPU selection than GCC, both # when "+fp" or "+fp.dp" are used and when no FPU specifier is provided in # "-march=". Using "-mfpu=" explicitly. 
diff --git a/patches/llvm-project/0006-libunwind-Support-aarch64-without-FPU.patch b/patches/llvm-project/0006-libunwind-Support-aarch64-without-FPU.patch new file mode 100644 index 00000000..faba5f30 --- /dev/null +++ b/patches/llvm-project/0006-libunwind-Support-aarch64-without-FPU.patch @@ -0,0 +1,58 @@ +From dd64908ad215a4f4cc79e3eb507f15b27b04e89f Mon Sep 17 00:00:00 2001 +From: Keith Packard +Date: Fri, 4 Oct 2024 21:06:37 -0700 +Subject: [libunwind] Support aarch64 without FPU + +Skip save/restore of FPU registers on targets without them. + +Signed-off-by: Keith Packard +--- + libunwind/src/UnwindRegistersRestore.S | 4 ++-- + libunwind/src/UnwindRegistersSave.S | 2 ++ + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S +index 180a66582f..1702d016c3 100644 +--- a/libunwind/src/UnwindRegistersRestore.S ++++ b/libunwind/src/UnwindRegistersRestore.S +@@ -658,7 +658,7 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) + ldp x26,x27, [x0, #0x0D0] + ldp x28,x29, [x0, #0x0E0] + ldr x30, [x0, #0x100] // restore pc into lr +- ++#if defined(__ARM_FP) && __ARM_FP != 0 + ldp d0, d1, [x0, #0x110] + ldp d2, d3, [x0, #0x120] + ldp d4, d5, [x0, #0x130] +@@ -676,7 +676,7 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) + ldp d28,d29, [x0, #0x1F0] + ldr d30, [x0, #0x200] + ldr d31, [x0, #0x208] +- ++#endif + // Finally, restore sp. 
This must be done after the last read from the + // context struct, because it is allocated on the stack, and an exception + // could clobber the de-allocated portion of the stack after sp has been +diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S +index fab234fcd6..a489a8ba6d 100644 +--- a/libunwind/src/UnwindRegistersSave.S ++++ b/libunwind/src/UnwindRegistersSave.S +@@ -746,6 +746,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) + str x1, [x0, #0x0F8] + str x30, [x0, #0x100] // store return address as pc + // skip cpsr ++#if defined(__ARM_FP) && __ARM_FP != 0 + stp d0, d1, [x0, #0x110] + stp d2, d3, [x0, #0x120] + stp d4, d5, [x0, #0x130] +@@ -763,6 +764,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) + stp d28,d29, [x0, #0x1F0] + str d30, [x0, #0x200] + str d31, [x0, #0x208] ++#endif + mov x0, #0 // return UNW_ESUCCESS + ret + +-- +2.45.2 + diff --git a/patches/llvm-project/0007-compiler-rt-Support-aarch64-targets-without-FPU.patch b/patches/llvm-project/0007-compiler-rt-Support-aarch64-targets-without-FPU.patch new file mode 100644 index 00000000..d8be16d1 --- /dev/null +++ b/patches/llvm-project/0007-compiler-rt-Support-aarch64-targets-without-FPU.patch @@ -0,0 +1,117 @@ +From 04146b444eb70c449a958b246ae4f4e9b52bf4bc Mon Sep 17 00:00:00 2001 +From: Keith Packard +Date: Fri, 4 Oct 2024 21:08:17 -0700 +Subject: [compiler-rt] Support aarch64 targets without FPU + +Fall back to the old C implementations of various routines when +the target doesn't have an FPU. 
+ +Signed-off-by: Keith Packard +--- + .../builtins/aarch64/sme-libc-mem-routines.S | 2 +- + .../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++ + 2 files changed, 78 insertions(+), 1 deletion(-) + +diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S +index 0318d9a6f1..72d87fb4fa 100644 +--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S ++++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S +@@ -6,7 +6,7 @@ + + #include "../assembly.h" + +-#ifdef __aarch64__ ++#if defined(__aarch64__) && __ARM_FP != 0 + + #define L(l) .L ## l + +diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c +index 315490e73e..92fb953c03 100644 +--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c ++++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c +@@ -1,5 +1,82 @@ + #include + ++#if __ARM_FP == 0 ++// WARNING: When building the scalar versions of these functions you need to ++// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang ++// from recognising a loop idiom and planting calls to memcpy! ++ ++static void *__arm_sc_memcpy_fwd(void *dest, const void *src, ++ size_t n) __arm_streaming_compatible { ++ unsigned char *destp = (unsigned char *)dest; ++ const unsigned char *srcp = (const unsigned char *)src; ++ for (size_t i = 0; i < n; ++i) ++ destp[i] = srcp[i]; ++ ++ return dest; ++} ++ ++// If dest and src overlap then behaviour is undefined, hence we can add the ++// restrict keywords here. This also matches the definition of the libc memcpy ++// according to the man page. 
++void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, ++ size_t n) __arm_streaming_compatible { ++ return __arm_sc_memcpy_fwd(dest, src, n); ++} ++ ++void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { ++ unsigned char *destp = (unsigned char *)dest; ++ unsigned char c8 = (unsigned char)c; ++ for (size_t i = 0; i < n; ++i) ++ destp[i] = c8; ++ ++ return dest; ++} ++ ++static void *__arm_sc_memcpy_rev(void *dest, const void *src, ++ size_t n) __arm_streaming_compatible { ++ unsigned char *destp = (unsigned char *)dest; ++ const unsigned char *srcp = (const unsigned char *)src; ++ // TODO: Improve performance by copying larger chunks in reverse, or by ++ // using SVE. ++ while (n > 0) { ++ --n; ++ destp[n] = srcp[n]; ++ } ++ return dest; ++} ++ ++// Semantically a memmove is equivalent to the following: ++// 1. Copy the entire contents of src to a temporary array that does not ++// overlap with src or dest. ++// 2. Copy the contents of the temporary array into dest. ++void *__arm_sc_memmove(void *dest, const void *src, ++ size_t n) __arm_streaming_compatible { ++ unsigned char *destp = (unsigned char *)dest; ++ const unsigned char *srcp = (const unsigned char *)src; ++ ++ // If src and dest don't overlap then just invoke memcpy ++ if ((srcp > (destp + n)) || (destp > (srcp + n))) ++ return __arm_sc_memcpy_fwd(dest, src, n); ++ ++ // Overlap case 1: ++ // src: Low | -> | High ++ // dest: Low | -> | High ++ // Here src is always ahead of dest at a higher address. If we first read a ++ // chunk of data from src we can safely write the same chunk to dest without ++ // corrupting future reads of src. ++ if (srcp > destp) ++ return __arm_sc_memcpy_fwd(dest, src, n); ++ ++ // Overlap case 2: ++ // src: Low | -> | High ++ // dest: Low | -> | High ++ // While we're in the overlap region we're always corrupting future reads of ++ // src when writing to dest. 
An efficient way to do this is to copy the data ++ // in reverse by starting at the highest address. ++ return __arm_sc_memcpy_rev(dest, src, n); ++} ++#endif ++ + const void *__arm_sc_memchr(const void *src, int c, + size_t n) __arm_streaming_compatible { + const unsigned char *srcp = (const unsigned char *)src; +-- +2.45.2 +