Commit 22b80d2
Author: Nicolas Pitre (committed)

arch: arm64: Implement SVE context switching for ARMv9-A

Implement Scalable Vector Extension (SVE) context switching support,
enabling threads to use SVE instructions with lazy context preservation
across task switches.

The implementation is incremental: if only FPU instructions are used, only
NEON access is granted and only the NEON context is preserved, to minimize
context switching overhead. If SVE instructions are used, the NEON context
is upgraded to SVE, and full SVE access is granted and the full SVE context
is preserved from that point onwards.

Signed-off-by: Nicolas Pitre <[email protected]>

1 parent 1bf36e2 commit 22b80d2
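The commit message describes a two-level lazy scheme: a thread gets NEON-only access (and the cheaper NEON save/restore) on its first FP/SIMD use, and is upgraded to full SVE access and SVE save/restore only once it actually executes an SVE instruction. Below is a minimal C sketch of that escalation logic, for illustration only: the mode constants, struct fields and the fp_access_trap/fp_context_save helpers are hypothetical, and only z_arm64_fpu_save/z_arm64_sve_save come from this commit (their exact signatures are assumed here).

	#include <stdbool.h>

	struct z_arm64_fp_context;	/* per-thread FP/SVE save area (layout assumed) */

	extern void z_arm64_fpu_save(struct z_arm64_fp_context *ctx);
	extern void z_arm64_sve_save(struct z_arm64_fp_context *ctx);

	enum fp_mode {
		FP_MODE_NONE,	/* no FP/SIMD access granted yet */
		FP_MODE_NEON,	/* NEON access granted, NEON context preserved */
		FP_MODE_SVE,	/* SVE access granted, full SVE context preserved */
	};

	struct fp_owner {
		enum fp_mode mode;
		struct z_arm64_fp_context *ctx;
	};

	/* On an FP/SVE access trap: grant only the access level actually needed. */
	static void fp_access_trap(struct fp_owner *owner, bool used_sve)
	{
		if (owner->mode == FP_MODE_NONE) {
			owner->mode = FP_MODE_NEON;	/* first FP use: NEON only */
		}
		if (used_sve && owner->mode == FP_MODE_NEON) {
			owner->mode = FP_MODE_SVE;	/* upgrade once SVE is used */
		}
	}

	/* On context switch away from the owning thread: save the matching state. */
	static void fp_context_save(struct fp_owner *owner)
	{
		if (owner->mode == FP_MODE_SVE) {
			z_arm64_sve_save(owner->ctx);	/* Z0-Z31, P0-P15, FFR, FPSR/FPCR */
		} else if (owner->mode == FP_MODE_NEON) {
			z_arm64_fpu_save(owner->ctx);	/* Q0-Q31 plus FPSR/FPCR */
		}
	}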

File tree: 12 files changed (+446 −37 lines)


arch/arm64/core/Kconfig

Lines changed: 24 additions & 0 deletions
@@ -249,6 +249,7 @@ config ARMV9_A
 	select CPU_HAS_MMU
 	select ARCH_HAS_USERSPACE if ARM_MMU
 	select ARCH_HAS_NOCACHE_MEMORY_SUPPORT if ARM_MMU
+	imply ARM64_SVE if FPU_SHARING
 	help
 	  This option signifies the use of an ARMv9-A processor
 	  implementation.
@@ -413,4 +414,27 @@ config ARM64_BOOT_DISABLE_DCACHE
 	  cache and then disable data cache, it will will be re-enabled after
 	  MMU is configured and enabled.
 
+config ARM64_SVE
+	bool "Scalable Vector Extension (SVE) support"
+	depends on ARMV9_A
+	help
+	  Enable support for ARM64 Scalable Vector Extension (SVE).
+	  This allows threads to use SVE instructions and automatically
+	  handles context switching of SVE registers (Z0-Z31, P0-P15, FFR)
+	  if CONFIG_FPU_SHARING is also set. Requires ARMv9-A architecture.
+
+config ARM64_SVE_VL_MAX
+	int "Maximum SVE vector length in bytes"
+	depends on ARM64_SVE
+	default 16
+	range 16 256
+	help
+	  Maximum supported SVE vector length in bytes. This determines
+	  the SVE context size within each thread structure. Valid values
+	  are any power of two from 16 to 256 inclusive (128 to 2048 bits).
+	  This can be smaller than the hardware supported vector length to
+	  save some per-thread memory in which case the hardware will be
+	  limited to the specified length. Having a larger value than what
+	  the hardware supports will only waste memory.
+
 endif # CPU_CORTEX_A || CPU_AARCH64_CORTEX_R
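The ARM64_SVE_VL_MAX help text above ties the configured vector length directly to per-thread memory: architecturally, each Z register holds VL bytes, each P register VL/8 bytes, and FFR another VL/8 bytes. A small arithmetic sketch of the resulting save-area size follows; the macro names other than CONFIG_ARM64_SVE_VL_MAX are illustrative, not taken from the commit.

	#include <stdio.h>

	#define CONFIG_ARM64_SVE_VL_MAX 16	/* bytes per Z register (128-bit vectors) */

	#define SVE_Z_REGS_SIZE  (32 * CONFIG_ARM64_SVE_VL_MAX)        /* Z0-Z31 */
	#define SVE_P_REGS_SIZE  (16 * (CONFIG_ARM64_SVE_VL_MAX / 8))  /* P0-P15 */
	#define SVE_FFR_SIZE     (CONFIG_ARM64_SVE_VL_MAX / 8)         /* FFR */

	int main(void)
	{
		printf("SVE state per thread: %d bytes (+ FPSR/FPCR)\n",
		       SVE_Z_REGS_SIZE + SVE_P_REGS_SIZE + SVE_FFR_SIZE);
		return 0;
	}

With the default of 16 bytes (128-bit vectors) this comes to 546 bytes per thread on top of the NEON area; at the maximum of 256 bytes it grows to 8736 bytes, which is why the help text warns against configuring a larger value than the hardware supports.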

arch/arm64/core/fatal.c

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ static bool z_arm64_stack_corruption_check(struct arch_esf *esf, uint64_t esr, u
 	 * a new nested exception triggered by FPU accessing (var_args).
 	 */
 	arch_flush_local_fpu();
-	write_cpacr_el1(read_cpacr_el1() | CPACR_EL1_FPEN_NOTRAP);
+	write_cpacr_el1(read_cpacr_el1() | CPACR_EL1_FPEN);
 #endif
 	arch_curr_cpu()->arch.corrupted_sp = 0UL;
 	EXCEPTION_DUMP("STACK OVERFLOW FROM KERNEL,"
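For context on the renamed constant: in CPACR_EL1, the FPEN field (bits [21:20]) controls trapping of FP/NEON instructions, while the separate ZEN field (bits [17:16]) controls trapping of SVE instructions; writing 0b11 to a field disables the corresponding traps at EL0 and EL1. The sketch below only illustrates those architectural fields, using hypothetical macro names; only CPACR_EL1_FPEN appears in the diff above.

	#include <stdint.h>

	#define CPACR_EL1_FPEN_SHIFT 20U	/* FPEN: FP/NEON access control */
	#define CPACR_EL1_ZEN_SHIFT  16U	/* ZEN: SVE access control */
	#define CPACR_EL1_NO_TRAP    0x3UL	/* 0b11: no EL0/EL1 trapping */

	/* Return a CPACR_EL1 value with FP/NEON traps disabled. */
	static inline uint64_t cpacr_enable_fp(uint64_t cpacr)
	{
		return cpacr | (CPACR_EL1_NO_TRAP << CPACR_EL1_FPEN_SHIFT);
	}

	/* Return a CPACR_EL1 value with SVE traps disabled as well. */
	static inline uint64_t cpacr_enable_sve(uint64_t cpacr)
	{
		return cpacr | (CPACR_EL1_NO_TRAP << CPACR_EL1_ZEN_SHIFT);
	}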

arch/arm64/core/fpu.S

Lines changed: 174 additions & 9 deletions
@@ -7,12 +7,20 @@
 
 #include <zephyr/toolchain.h>
 #include <zephyr/linker/sections.h>
+#include <zephyr/offsets.h>
 
 _ASM_FILE_PROLOGUE
 
 GTEXT(z_arm64_fpu_save)
 SECTION_FUNC(TEXT, z_arm64_fpu_save)
 
+	mrs x1, fpsr
+	mrs x2, fpcr
+	str w1, [x0, #__z_arm64_fp_context_fpsr_OFFSET]
+	str w2, [x0, #__z_arm64_fp_context_fpcr_OFFSET]
+
+	/* Save NEON registers */
+	add x0, x0, #__z_arm64_fp_context_neon_OFFSET
 	stp q0, q1, [x0, #(16 * 0)]
 	stp q2, q3, [x0, #(16 * 2)]
 	stp q4, q5, [x0, #(16 * 4)]
@@ -30,16 +38,18 @@ SECTION_FUNC(TEXT, z_arm64_fpu_save)
 	stp q28, q29, [x0, #(16 * 28)]
 	stp q30, q31, [x0, #(16 * 30)]
 
-	mrs x1, fpsr
-	mrs x2, fpcr
-	str w1, [x0, #(16 * 32 + 0)]
-	str w2, [x0, #(16 * 32 + 4)]
-
 	ret
 
 GTEXT(z_arm64_fpu_restore)
 SECTION_FUNC(TEXT, z_arm64_fpu_restore)
 
+	ldr w1, [x0, #__z_arm64_fp_context_fpsr_OFFSET]
+	ldr w2, [x0, #__z_arm64_fp_context_fpcr_OFFSET]
+	msr fpsr, x1
+	msr fpcr, x2
+
+	/* Restore NEON registers */
+	add x0, x0, #__z_arm64_fp_context_neon_OFFSET
 	ldp q0, q1, [x0, #(16 * 0)]
 	ldp q2, q3, [x0, #(16 * 2)]
 	ldp q4, q5, [x0, #(16 * 4)]
@@ -57,9 +67,164 @@ SECTION_FUNC(TEXT, z_arm64_fpu_restore)
 	ldp q28, q29, [x0, #(16 * 28)]
 	ldp q30, q31, [x0, #(16 * 30)]
 
-	ldr w1, [x0, #(16 * 32 + 0)]
-	ldr w2, [x0, #(16 * 32 + 4)]
-	msr fpsr, x1
-	msr fpcr, x2
+	ret
+
+#ifdef CONFIG_ARM64_SVE
+
+GTEXT(z_arm64_sve_save)
+SECTION_FUNC(TEXT, z_arm64_sve_save)
+
+	/* Save control registers */
+	mrs x2, fpsr
+	mrs x3, fpcr
+	str w2, [x0, #__z_arm64_fp_context_fpsr_OFFSET]
+	str w3, [x0, #__z_arm64_fp_context_fpcr_OFFSET]
+
+	/* Get Z registers base address */
+	add x2, x0, #__z_arm64_fp_context_sve_z_regs_OFFSET
+
+	/* Save Z registers */
+	str z0, [x2, #0, MUL VL]
+	str z1, [x2, #1, MUL VL]
+	str z2, [x2, #2, MUL VL]
+	str z3, [x2, #3, MUL VL]
+	str z4, [x2, #4, MUL VL]
+	str z5, [x2, #5, MUL VL]
+	str z6, [x2, #6, MUL VL]
+	str z7, [x2, #7, MUL VL]
+	str z8, [x2, #8, MUL VL]
+	str z9, [x2, #9, MUL VL]
+	str z10, [x2, #10, MUL VL]
+	str z11, [x2, #11, MUL VL]
+	str z12, [x2, #12, MUL VL]
+	str z13, [x2, #13, MUL VL]
+	str z14, [x2, #14, MUL VL]
+	str z15, [x2, #15, MUL VL]
+	str z16, [x2, #16, MUL VL]
+	str z17, [x2, #17, MUL VL]
+	str z18, [x2, #18, MUL VL]
+	str z19, [x2, #19, MUL VL]
+	str z20, [x2, #20, MUL VL]
+	str z21, [x2, #21, MUL VL]
+	str z22, [x2, #22, MUL VL]
+	str z23, [x2, #23, MUL VL]
+	str z24, [x2, #24, MUL VL]
+	str z25, [x2, #25, MUL VL]
+	str z26, [x2, #26, MUL VL]
+	str z27, [x2, #27, MUL VL]
+	str z28, [x2, #28, MUL VL]
+	str z29, [x2, #29, MUL VL]
+	str z30, [x2, #30, MUL VL]
+	str z31, [x2, #31, MUL VL]
+
+	/* Get P registers base address */
+	mov x3, #__z_arm64_fp_context_sve_p_regs_OFFSET
+	add x3, x0, x3
+
+	/* Save P registers */
+	str p0, [x3, #0, MUL VL]
+	str p1, [x3, #1, MUL VL]
+	str p2, [x3, #2, MUL VL]
+	str p3, [x3, #3, MUL VL]
+	str p4, [x3, #4, MUL VL]
+	str p5, [x3, #5, MUL VL]
+	str p6, [x3, #6, MUL VL]
+	str p7, [x3, #7, MUL VL]
+	str p8, [x3, #8, MUL VL]
+	str p9, [x3, #9, MUL VL]
+	str p10, [x3, #10, MUL VL]
+	str p11, [x3, #11, MUL VL]
+	str p12, [x3, #12, MUL VL]
+	str p13, [x3, #13, MUL VL]
+	str p14, [x3, #14, MUL VL]
+	str p15, [x3, #15, MUL VL]
+
+	/* Get FFR base address */
+	mov x4, #__z_arm64_fp_context_sve_ffr_OFFSET
+	add x4, x0, x4
+
+	/* Save FFR */
+	rdffr p0.b
+	str p0, [x4]
+
+	ret
+
+GTEXT(z_arm64_sve_restore)
+SECTION_FUNC(TEXT, z_arm64_sve_restore)
+
+	/* Get Z registers base address */
+	add x2, x0, #__z_arm64_fp_context_sve_z_regs_OFFSET
+
+	/* Restore Z registers */
+	ldr z0, [x2, #0, MUL VL]
+	ldr z1, [x2, #1, MUL VL]
+	ldr z2, [x2, #2, MUL VL]
+	ldr z3, [x2, #3, MUL VL]
+	ldr z4, [x2, #4, MUL VL]
+	ldr z5, [x2, #5, MUL VL]
+	ldr z6, [x2, #6, MUL VL]
+	ldr z7, [x2, #7, MUL VL]
+	ldr z8, [x2, #8, MUL VL]
+	ldr z9, [x2, #9, MUL VL]
+	ldr z10, [x2, #10, MUL VL]
+	ldr z11, [x2, #11, MUL VL]
+	ldr z12, [x2, #12, MUL VL]
+	ldr z13, [x2, #13, MUL VL]
+	ldr z14, [x2, #14, MUL VL]
+	ldr z15, [x2, #15, MUL VL]
+	ldr z16, [x2, #16, MUL VL]
+	ldr z17, [x2, #17, MUL VL]
+	ldr z18, [x2, #18, MUL VL]
+	ldr z19, [x2, #19, MUL VL]
+	ldr z20, [x2, #20, MUL VL]
+	ldr z21, [x2, #21, MUL VL]
+	ldr z22, [x2, #22, MUL VL]
+	ldr z23, [x2, #23, MUL VL]
+	ldr z24, [x2, #24, MUL VL]
+	ldr z25, [x2, #25, MUL VL]
+	ldr z26, [x2, #26, MUL VL]
+	ldr z27, [x2, #27, MUL VL]
+	ldr z28, [x2, #28, MUL VL]
+	ldr z29, [x2, #29, MUL VL]
+	ldr z30, [x2, #30, MUL VL]
+	ldr z31, [x2, #31, MUL VL]
+
+	/* Get FFR base address */
+	mov x4, #__z_arm64_fp_context_sve_ffr_OFFSET
+	add x4, x0, x4
+
+	/* Restore FFR */
+	ldr p0, [x4]
+	wrffr p0.b
+
+	/* Get P registers base address */
+	mov x3, #__z_arm64_fp_context_sve_p_regs_OFFSET
+	add x3, x0, x3
+
+	/* Restore P registers */
+	ldr p0, [x3, #0, MUL VL]
+	ldr p1, [x3, #1, MUL VL]
+	ldr p2, [x3, #2, MUL VL]
+	ldr p3, [x3, #3, MUL VL]
+	ldr p4, [x3, #4, MUL VL]
+	ldr p5, [x3, #5, MUL VL]
+	ldr p6, [x3, #6, MUL VL]
+	ldr p7, [x3, #7, MUL VL]
+	ldr p8, [x3, #8, MUL VL]
+	ldr p9, [x3, #9, MUL VL]
+	ldr p10, [x3, #10, MUL VL]
+	ldr p11, [x3, #11, MUL VL]
+	ldr p12, [x3, #12, MUL VL]
+	ldr p13, [x3, #13, MUL VL]
+	ldr p14, [x3, #14, MUL VL]
+	ldr p15, [x3, #15, MUL VL]
+
+	/* Restore control registers */
+	ldr w2, [x0, #__z_arm64_fp_context_fpsr_OFFSET]
+	ldr w3, [x0, #__z_arm64_fp_context_fpcr_OFFSET]
+	msr fpsr, x2
+	msr fpcr, x3
 
 	ret
+
+#endif /* CONFIG_ARM64_SVE */
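The __z_arm64_fp_context_*_OFFSET symbols used above are generated from a C structure defined elsewhere in this commit; a plausible view of that layout, sized by CONFIG_ARM64_SVE_VL_MAX, is sketched below. The field order, types and alignment are assumptions for illustration, not the committed definition.

	#include <stdint.h>

	#define SVE_VL_MAX 16	/* stand-in for CONFIG_ARM64_SVE_VL_MAX (bytes) */

	struct z_arm64_fp_context {
		uint32_t fpsr;				/* __z_arm64_fp_context_fpsr_OFFSET */
		uint32_t fpcr;				/* __z_arm64_fp_context_fpcr_OFFSET */
		uint8_t  neon[32 * 16];			/* Q0-Q31, __z_arm64_fp_context_neon_OFFSET */
		uint8_t  sve_z_regs[32 * SVE_VL_MAX];		/* Z0-Z31, VL bytes each */
		uint8_t  sve_p_regs[16 * (SVE_VL_MAX / 8)];	/* P0-P15, VL/8 bytes each */
		uint8_t  sve_ffr[SVE_VL_MAX / 8];		/* FFR, VL/8 bytes */
	} __attribute__((aligned(16)));

Note that the "#imm, MUL VL" addressing mode used by the SVE str/ldr instructions scales the immediate by the current hardware vector length, so consecutive Z registers land VL bytes apart within sve_z_regs and consecutive P registers VL/8 bytes apart within sve_p_regs.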
