
Commit 267da96

Add aarch64a_soft_nofp variant

Build a soft-float multilib variant for aarch64 supporting targets without an FPU. This contains a couple of minor fixes for llvm to enable this; I'll be submitting those upstream.

Signed-off-by: Keith Packard <[email protected]>

1 parent 1133650 commit 267da96

File tree

3 files changed: +325 -0 lines changed

CMakeLists.txt

Lines changed: 18 additions & 0 deletions
@@ -291,6 +291,8 @@ set(
   ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0003-Disable-failing-compiler-rt-test.patch
   ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0004-libc-tests-with-picolibc-XFAIL-uses-of-atomics.patch
   ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0005-libc-tests-with-picolibc-mark-two-more-large-tests.patch
+  ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0007-libunwind-Support-aarch64-without-FPU.patch
+  ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm-project/0008-compiler-rt-Support-aarch64-targets-without-FPU.patch
 )
 FetchContent_Declare(llvmproject
   GIT_REPOSITORY https://github.com/llvm/llvm-project.git
@@ -1614,6 +1616,22 @@ add_library_variants_for_cpu(
   RAM_SIZE 0x1000000
   STACK_SIZE 8K
 )
+add_library_variants_for_cpu(
+  aarch64a
+  SUFFIX soft_nofp
+  COMPILE_FLAGS "-march=armv8-a+nofp+nosimd -mabi=aapcs-soft"
+  MULTILIB_FLAGS "--target=aarch64-unknown-none-elf -march=armv8-a+nofp+nosimd -mabi=aapcs-soft"
+  PICOLIBC_BUILD_TYPE "release"
+  QEMU_MACHINE "virt"
+  QEMU_CPU "cortex-a57"
+  BOOT_FLASH_ADDRESS 0x40000000
+  BOOT_FLASH_SIZE 0x1000
+  FLASH_ADDRESS 0x40001000
+  FLASH_SIZE 0xfff000
+  RAM_ADDRESS 0x41000000
+  RAM_SIZE 0x1000000
+  STACK_SIZE 8K
+)
 # For AArch32, clang uses different defaults for FPU selection than GCC, both
 # when "+fp" or "+fp.dp" are used and when no FPU specifier is provided in
 # "-march=". Using "-mfpu=" explicitly.
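
The MULTILIB_FLAGS value above is what clang's multilib selection matches against, so code intended for this variant is compiled with the same flags. A minimal sketch of an invocation (hypothetical file names; the exact driver setup and link options depend on how the toolchain is installed):

  clang --target=aarch64-unknown-none-elf -march=armv8-a+nofp+nosimd -mabi=aapcs-soft -c -o app.o app.c
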
patches/llvm-project/0007-libunwind-Support-aarch64-without-FPU.patch

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
From 6507d75387f0d5b7f926d5a60a238e930c9453b3 Mon Sep 17 00:00:00 2001
From: Keith Packard <[email protected]>
Date: Fri, 4 Oct 2024 21:06:37 -0700
Subject: [PATCH 6/7] [libunwind] Support aarch64 without FPU

ldp and stp instructions both require an FPU. Use pairs of ldr or str
instructions when the target doesn't have one.

Signed-off-by: Keith Packard <[email protected]>
---
 libunwind/src/UnwindRegistersRestore.S | 36 +++++++------
 libunwind/src/UnwindRegistersSave.S    | 70 ++++++++++++++------------
 2 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 180a66582f..13a40d080c 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -633,6 +633,12 @@ Lnovec:
 .arch_extension gcs
 #endif
 
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define LDP(a,b,r,o,p) ldp a, b, [r, o]
+#else
+#define LDP(a,b,r,o,p) ldr a, [r, o] ; ldr b, [r, p]
+#endif
+
 //
 // extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
 //
@@ -642,23 +648,24 @@ Lnovec:
   .p2align 2
 DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
 // skip restore of x0,x1 for now
-  ldp x2, x3, [x0, #0x010]
-  ldp x4, x5, [x0, #0x020]
-  ldp x6, x7, [x0, #0x030]
-  ldp x8, x9, [x0, #0x040]
-  ldp x10,x11, [x0, #0x050]
-  ldp x12,x13, [x0, #0x060]
-  ldp x14,x15, [x0, #0x070]
+  LDP(x2, x3, x0, #0x010, #0x018)
+  LDP(x4, x5, x0, #0x020, #0x028)
+  LDP(x6, x7, x0, #0x030, #0x038)
+  LDP(x8, x9, x0, #0x040, #0x048)
+  LDP(x10, x11, x0, #0x050, #0x058)
+  LDP(x12, x13, x0, #0x060, #0x068)
+  LDP(x14, x15, x0, #0x070, #0x078)
 // x16 and x17 were clobbered by the call into the unwinder, so no point in
 // restoring them.
-  ldp x18,x19, [x0, #0x090]
-  ldp x20,x21, [x0, #0x0A0]
-  ldp x22,x23, [x0, #0x0B0]
-  ldp x24,x25, [x0, #0x0C0]
-  ldp x26,x27, [x0, #0x0D0]
-  ldp x28,x29, [x0, #0x0E0]
+  LDP(x18, x19, x0, #0x090, #0x098)
+  LDP(x20, x21, x0, #0x0A0, #0x0A8)
+  LDP(x22, x23, x0, #0x0B0, #0x0B8)
+  LDP(x24, x25, x0, #0x0C0, #0x0C8)
+  LDP(x26, x27, x0, #0x0D0, #0x0D8)
+  LDP(x28, x29, x0, #0x0E0, #0x0E8)
   ldr x30, [x0, #0x100] // restore pc into lr
 
+#if defined(__ARM_FP) && __ARM_FP != 0
   ldp d0, d1, [x0, #0x110]
   ldp d2, d3, [x0, #0x120]
   ldp d4, d5, [x0, #0x130]
@@ -676,13 +683,14 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
   ldp d28,d29, [x0, #0x1F0]
   ldr d30, [x0, #0x200]
   ldr d31, [x0, #0x208]
+#endif
 
 // Finally, restore sp. This must be done after the last read from the
 // context struct, because it is allocated on the stack, and an exception
 // could clobber the de-allocated portion of the stack after sp has been
 // restored.
   ldr x16, [x0, #0x0F8]
-  ldp x0, x1, [x0, #0x000] // restore x0,x1
+  LDP(x0, x1, x0, #0x000, #0x008) // restore x0,x1
   mov sp,x16 // restore sp
 #if defined(__ARM_FEATURE_GCS_DEFAULT)
 // If GCS is enabled we need to push the address we're returning to onto the
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index fab234fcd6..712db25604 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -646,7 +646,7 @@ LnoR2Fix:
 stfd 17,296(3)
 stfd 18,304(3)
 stfd 19,312(3)
-stfd 20,320(3)
+stfd 20,320(3)
 stfd 21,328(3)
 stfd 22,336(3)
 stfd 23,344(3)
@@ -718,6 +718,12 @@ LnoR2Fix:
 
 #elif defined(__aarch64__)
 
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define STP(a,b,r,o,p) stp a, b, [r, o]
+#else
+#define STP(a,b,r,o,p) str a, [r, o] ; str b, [r, p]
+#endif
+
 //
 // extern int __unw_getcontext(unw_context_t* thread_state)
 //
@@ -726,43 +732,45 @@ LnoR2Fix:
 //
   .p2align 2
 DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  stp x0, x1, [x0, #0x000]
-  stp x2, x3, [x0, #0x010]
-  stp x4, x5, [x0, #0x020]
-  stp x6, x7, [x0, #0x030]
-  stp x8, x9, [x0, #0x040]
-  stp x10,x11, [x0, #0x050]
-  stp x12,x13, [x0, #0x060]
-  stp x14,x15, [x0, #0x070]
-  stp x16,x17, [x0, #0x080]
-  stp x18,x19, [x0, #0x090]
-  stp x20,x21, [x0, #0x0A0]
-  stp x22,x23, [x0, #0x0B0]
-  stp x24,x25, [x0, #0x0C0]
-  stp x26,x27, [x0, #0x0D0]
-  stp x28,x29, [x0, #0x0E0]
+  STP(x0, x1, x0, #0x000, #0x008)
+  STP(x2, x3, x0, #0x010, #0x018)
+  STP(x4, x5, x0, #0x020, #0x028)
+  STP(x6, x7, x0, #0x030, #0x038)
+  STP(x8, x9, x0, #0x040, #0x048)
+  STP(x10, x11, x0, #0x050, #0x058)
+  STP(x12, x13, x0, #0x060, #0x068)
+  STP(x14, x15, x0, #0x070, #0x078)
+  STP(x16, x17, x0, #0x080, #0x088)
+  STP(x18, x19, x0, #0x090, #0x098)
+  STP(x20, x21, x0, #0x0A0, #0x0A8)
+  STP(x22, x23, x0, #0x0B0, #0x0B8)
+  STP(x24, x25, x0, #0x0C0, #0x0C8)
+  STP(x26, x27, x0, #0x0D0, #0x0D8)
+  STP(x28, x29, x0, #0x0E0, #0x0E8)
   str x30, [x0, #0x0F0]
   mov x1,sp
   str x1, [x0, #0x0F8]
   str x30, [x0, #0x100] // store return address as pc
 // skip cpsr
-  stp d0, d1, [x0, #0x110]
-  stp d2, d3, [x0, #0x120]
-  stp d4, d5, [x0, #0x130]
-  stp d6, d7, [x0, #0x140]
-  stp d8, d9, [x0, #0x150]
-  stp d10,d11, [x0, #0x160]
-  stp d12,d13, [x0, #0x170]
-  stp d14,d15, [x0, #0x180]
-  stp d16,d17, [x0, #0x190]
-  stp d18,d19, [x0, #0x1A0]
-  stp d20,d21, [x0, #0x1B0]
-  stp d22,d23, [x0, #0x1C0]
-  stp d24,d25, [x0, #0x1D0]
-  stp d26,d27, [x0, #0x1E0]
-  stp d28,d29, [x0, #0x1F0]
+#if defined(__ARM_FP) && __ARM_FP != 0
+  STP(d0, d1, x0, #0x110, #0x118)
+  STP(d2, d3, x0, #0x120, #0x128)
+  STP(d4, d5, x0, #0x130, #0x138)
+  STP(d6, d7, x0, #0x140, #0x148)
+  STP(d8, d9, x0, #0x150, #0x158)
+  STP(d10, d11, x0, #0x160, #0x168)
+  STP(d12, d13, x0, #0x170, #0x178)
+  STP(d14, d15, x0, #0x180, #0x188)
+  STP(d16, d17, x0, #0x190, #0x198)
+  STP(d18, d19, x0, #0x1A0, #0x1A8)
+  STP(d20, d21, x0, #0x1B0, #0x1B8)
+  STP(d22, d23, x0, #0x1C0, #0x1C8)
+  STP(d24, d25, x0, #0x1D0, #0x1D8)
+  STP(d26, d27, x0, #0x1E0, #0x1E8)
+  STP(d28, d29, x0, #0x1F0, #0x1F8)
   str d30, [x0, #0x200]
   str d31, [x0, #0x208]
+#endif
   mov x0, #0 // return UNW_ESUCCESS
   ret
 
-- 
2.45.2
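
To make the macro trick above concrete: the register-pair call sites keep their shape, and only the instruction underneath changes. An illustrative preprocessor expansion (sketch, not generated output):

// With an FPU (__ARM_FP != 0):
//   STP(x2, x3, x0, #0x010, #0x018)  =>  stp x2, x3, [x0, #0x010]
// Without an FPU:
//   STP(x2, x3, x0, #0x010, #0x018)  =>  str x2, [x0, #0x010] ; str x3, [x0, #0x018]
// The second offset (#0x018, parameter p) is only consumed by the scalar
// form; the paired form covers both registers from the first offset.
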
patches/llvm-project/0008-compiler-rt-Support-aarch64-targets-without-FPU.patch

Lines changed: 117 additions & 0 deletions

@@ -0,0 +1,117 @@
From 71691b4bbd32da80437e38e4e8b48b65c71b7142 Mon Sep 17 00:00:00 2001
From: Keith Packard <[email protected]>
Date: Fri, 4 Oct 2024 21:08:17 -0700
Subject: [PATCH 7/7] [compiler-rt] Support aarch64 targets without FPU

Fall back to the old C implementations of various routines when
the target doesn't have an FPU.

Signed-off-by: Keith Packard <[email protected]>
---
 .../builtins/aarch64/sme-libc-mem-routines.S |  2 +-
 .../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 0318d9a6f1..72d87fb4fa 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,7 +6,7 @@
 
 #include "../assembly.h"
 
-#ifdef __aarch64__
+#if defined(__aarch64__) && __ARM_FP != 0
 
 #define L(l) .L ## l
 
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 315490e73e..92fb953c03 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,5 +1,82 @@
 #include <stddef.h>
 
+#if __ARM_FP == 0
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = srcp[i];
+
+  return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+                      size_t n) __arm_streaming_compatible {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = c8;
+
+  return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  // TODO: Improve performance by copying larger chunks in reverse, or by
+  // using SVE.
+  while (n > 0) {
+    --n;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+//   1. Copy the entire contents of src to a temporary array that does not
+//      overlap with src or dest.
+//   2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+                       size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  // If src and dest don't overlap then just invoke memcpy
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //   src:     Low |  ->  | High
+  //   dest: Low |  ->  | High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //   src:  Low |  ->  | High
+  //   dest:    Low |  ->  | High
+  // While we're in the overlap region we're always corrupting future reads of
+  // src when writing to dest. An efficient way to do this is to copy the data
+  // in reverse by starting at the highest address.
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
+#endif
+
 const void *__arm_sc_memchr(const void *src, int c,
                             size_t n) __arm_streaming_compatible {
   const unsigned char *srcp = (const unsigned char *)src;
-- 
2.45.2
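
The WARNING comment in the new C routines above deserves a gloss: without "-mllvm -disable-loop-idiom-all", clang's loop-idiom recognition rewrites a byte-copy loop into a call to memcpy, turning the fallback into a call to the very routine family it is meant to stand in for. A minimal sketch of the loop shape involved (hypothetical name copy_fwd; the __arm_streaming_compatible attribute is omitted so the snippet compiles with any C compiler):

#include <stddef.h>

/* Same shape as the patch's __arm_sc_memcpy_fwd. At -O2, LLVM's
   LoopIdiomRecognize pass treats this loop as a memcpy idiom and replaces
   the whole loop body with a libc memcpy call unless the pass is disabled. */
void *copy_fwd(void *dest, const void *src, size_t n) {
  unsigned char *d = (unsigned char *)dest;
  const unsigned char *s = (const unsigned char *)src;
  for (size_t i = 0; i < n; ++i)
    d[i] = s[i];
  return dest;
}
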
