diff --git a/arch/Kconfig b/arch/Kconfig index 225f39d73c19b..1cd6172c84f92 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1031,7 +1031,7 @@ config FPU config FPU_SHARING bool "FPU register sharing" - depends on FPU && MULTITHREADING + depends on FPU help This option enables preservation of the hardware floating point registers across context switches to allow multiple threads to perform concurrent diff --git a/arch/arm/core/Kconfig b/arch/arm/core/Kconfig index 5505eee4e19b4..125e8e4bf954f 100644 --- a/arch/arm/core/Kconfig +++ b/arch/arm/core/Kconfig @@ -6,11 +6,11 @@ config CPU_CORTEX_M bool select CPU_CORTEX - select ARCH_HAS_CUSTOM_SWAP_TO_MAIN + select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !USE_SWITCH select HAS_CMSIS_CORE select HAS_FLASH_LOAD_OFFSET select ARCH_HAS_SINGLE_THREAD_SUPPORT - select ARCH_HAS_THREAD_ABORT + select ARCH_HAS_THREAD_ABORT if !USE_SWITCH select ARCH_HAS_TRUSTED_EXECUTION if ARM_TRUSTZONE_M select ARCH_HAS_STACK_PROTECTION if (ARM_MPU && !ARMV6_M_ARMV8_M_BASELINE) || CPU_CORTEX_M_HAS_SPLIM select ARCH_HAS_USERSPACE if ARM_MPU diff --git a/arch/arm/core/cortex_m/CMakeLists.txt b/arch/arm/core/cortex_m/CMakeLists.txt index 3de8e544bc121..f9435ebcf9394 100644 --- a/arch/arm/core/cortex_m/CMakeLists.txt +++ b/arch/arm/core/cortex_m/CMakeLists.txt @@ -20,21 +20,22 @@ elseif(CONFIG_ARMV8_1_M_PACBTI_NONE) endif() zephyr_library_sources( - exc_exit.c fault.c fault_s.S fpu.c reset.S scb.c - thread_abort.c vector_table.S - swap_helper.S + svc.S irq_manage.c prep_c.c thread.c cpu_idle.c ) +zephyr_library_sources_ifdef(CONFIG_USE_SWITCH arm-m-switch.c) +zephyr_library_sources_ifndef(CONFIG_USE_SWITCH exc_exit.c thread_abort.c swap_helper.S) + zephyr_library_sources_ifndef(CONFIG_ARM_CUSTOM_INTERRUPT_CONTROLLER irq_init.c) zephyr_library_sources_ifdef(CONFIG_GEN_SW_ISR_TABLE isr_wrapper.c) zephyr_library_sources_ifdef(CONFIG_DEBUG_COREDUMP coredump.c) diff --git a/arch/arm/core/cortex_m/Kconfig b/arch/arm/core/cortex_m/Kconfig index cb7f134598f3a..b506249f40733 100644 --- a/arch/arm/core/cortex_m/Kconfig +++ b/arch/arm/core/cortex_m/Kconfig @@ -106,6 +106,9 @@ config CPU_CORTEX_M7 if CPU_CORTEX_M +config USE_SWITCH + default y if USE_SWITCH_SUPPORTED && !ARM_NONSECURE_FIRMWARE + config CPU_CORTEX_M_HAS_SYSTICK bool help @@ -226,6 +229,7 @@ config ARMV7_M_ARMV8_M_MAINLINE select CPU_CORTEX_M_HAS_VTOR select CPU_CORTEX_M_HAS_PROGRAMMABLE_FAULT_PRIOS select CPU_CORTEX_M_HAS_SYSTICK + select USE_SWITCH_SUPPORTED help This option signifies the use of an ARMv7-M processor implementation, or the use of a backwards-compatible @@ -555,6 +559,15 @@ config CORTEX_M_NULL_POINTER_EXCEPTION_PAGE_SIZE endif # CORTEX_M_NULL_POINTER_EXCEPTION +config ARM_GCC_FP_WORKAROUND + bool "GCC/gas frame pointer workaround" + help + Enables a two-instruction workaround for a GNU toolchain + buglet in arch_switch() when the kernel must unavoidably be + compiled with -fno-omit-frame-pointer. Most code doesn't + want this, but in at least one case it's necessary. 
See + comment in arm-m-switch.h + rsource "tz/Kconfig" endif # CPU_CORTEX_M diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c new file mode 100644 index 0000000000000..7040907d48f5c --- /dev/null +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -0,0 +1,610 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include + +/* The basic exception frame, popped by the hardware during return */ +struct hw_frame_base { + uint32_t r0, r1, r2, r3; + uint32_t r12; + uint32_t lr; + uint32_t pc; + uint32_t apsr; +}; + +/* The hardware frame pushed when entry is taken with FPU active */ +struct hw_frame_fpu { + struct hw_frame_base base; + uint32_t s_regs[16]; + uint32_t fpscr; + uint32_t reserved; +}; + +/* The hardware frame pushed when entry happens with a misaligned stack */ +struct hw_frame_align { + struct hw_frame_base base; + uint32_t align_pad; +}; + +/* Both of the above */ +struct hw_frame_align_fpu { + struct hw_frame_fpu base; + uint32_t align_pad; +}; + +/* Zephyr's synthesized frame used during context switch on interrupt + * exit. It's a minimal hardware frame plus storage for r4-11. + */ +struct synth_frame { + uint32_t r7, r8, r9, r10, r11; + uint32_t r4, r5, r6; /* these match switch format */ + struct hw_frame_base base; +}; + +/* Zephyr's custom frame used for suspended threads, not hw-compatible */ +struct switch_frame { +#ifdef CONFIG_BUILTIN_STACK_GUARD + uint32_t psplim; +#endif + uint32_t apsr; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12; + uint32_t lr; + uint32_t pc; +}; + +/* Union of synth and switch frame, used during context switch */ +union u_frame { + struct { + char pad[sizeof(struct switch_frame) - sizeof(struct synth_frame)]; + struct synth_frame hw; + }; + struct switch_frame sw; +}; + +/* u_frame with have_fpu flag prepended (zero value), no FPU state */ +struct z_frame { +#ifdef CONFIG_FPU + uint32_t have_fpu; +#endif + union u_frame u; +}; + +/* u_frame + FPU data, with have_fpu (non-zero) */ +struct z_frame_fpu { + uint32_t have_fpu; + uint32_t s_regs[32]; + uint32_t fpscr; + union u_frame u; +}; + +/* Union of all possible stack frame formats, aligned at the top (!). + * Note that FRAMESZ is constructed to be larger than any of them to + * avoid having a zero-length array. The code doesn't ever use the + * size of this struct, it just wants to be have compiler-visible + * offsets for in-place copies. 
+ */ +/* clang-format off */ +#define FRAMESZ (4 + MAX(sizeof(struct z_frame_fpu), sizeof(struct hw_frame_align_fpu))) +#define PAD(T) char pad_##T[FRAMESZ - sizeof(struct T)] +union frame { + struct { PAD(hw_frame_base); struct hw_frame_base hw; }; + struct { PAD(hw_frame_fpu); struct hw_frame_fpu hwfp; }; + struct { PAD(hw_frame_align); struct hw_frame_align hw_a; }; + struct { PAD(hw_frame_align_fpu); struct hw_frame_align_fpu hwfp_a; }; + struct { PAD(z_frame); struct z_frame z; }; + struct { PAD(z_frame_fpu); struct z_frame_fpu zfp; }; +}; +/* clang-format on */ + +#ifdef __GNUC__ +/* Validate the structs are correctly top-aligned */ +#define FRAME_FIELD_END(F) ((void *)&(&(((union frame *)0)->F))[1]) +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hwfp)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hw_a)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hwfp_a)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(z)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(zfp)); +#endif + +#ifdef CONFIG_FPU +uint32_t arm_m_switch_stack_buffer = sizeof(struct z_frame_fpu) - sizeof(struct hw_frame_base); +#else +uint32_t arm_m_switch_stack_buffer = sizeof(struct z_frame) - sizeof(struct hw_frame_base); +#endif + +struct arm_m_cs_ptrs arm_m_cs_ptrs; + +#ifdef CONFIG_LTO +/* Toolchain workaround: when building with LTO, gcc seems unable to + * notice the external references in the assembly for arm_m_exc_exit + * below, and drops the symbols before the final link. Use this + * global to store pointers in arm_m_new_stack(), wasting a few bytes + * of code & data. + */ +void *arm_m_lto_refs[2]; +#endif + +/* Unit test hook, unused in production */ +void *arm_m_last_switch_handle; + +/* Global holder for the location of the saved LR in the entry frame. */ +uint32_t *arm_m_exc_lr_ptr; + +/* Dummy used in arch_switch() when USERSPACE=y */ +uint32_t arm_m_switch_control; + +/* clang-format off */ +/* Emits an in-place copy from a hw_frame_base to a switch_frame */ +#define HW_TO_SWITCH(hw, sw) do { \ + uint32_t r0 = hw.r0, r1 = hw.r1, r2 = hw.r2, r3 = hw.r3; \ + uint32_t r12 = hw.r12, lr = hw.lr, pc = hw.pc, apsr = hw.apsr; \ + pc |= 1; /* thumb bit! */ \ + sw.r0 = r0; sw.r1 = r1; sw.r2 = r2; sw.r3 = r3; \ + sw.r12 = r12; sw.lr = lr; sw.pc = pc; sw.apsr = apsr; \ +} while (false) + +/* Emits an in-place copy from a switch_frame to a synth_frame */ +#define SWITCH_TO_SYNTH(sw, syn) do { \ + struct synth_frame syntmp = { \ + .r4 = sw.r4, .r5 = sw.r5, .r6 = sw.r6, .r7 = sw.r7, \ + .r8 = sw.r8, .r9 = sw.r9, .r10 = sw.r10, .r11 = sw.r11, \ + .base.r0 = sw.r0, .base.r1 = sw.r1, .base.r2 = sw.r2, \ + .base.r3 = sw.r3, .base.r12 = sw.r12, .base.lr = sw.lr, \ + .base.pc = sw.pc, .base.apsr = sw.apsr, \ + }; \ + syn = syntmp; \ +} while (false) +/* clang-format on */ + +/* The arch/cpu/toolchain are horrifyingly inconsistent with how the + * thumb bit is treated in runtime addresses. The PC target for a B + * instruction must have it set. The PC pushed from an exception has + * is unset. The linker puts functions at even addresses, obviously, + * but the symbol address exposed at runtime has it set. Exception + * return ignores it. Use this to avoid insanity. + */ +static bool pc_match(uint32_t pc, void *addr) +{ + return ((pc ^ (uint32_t) addr) & ~1) == 0; +} + +/* Reports if the passed return address is a valid EXC_RETURN (high + * four bits set) that will restore to the PSP running in thread mode + * (low four bits == 0xd). That is an interrupted Zephyr thread + * context. 
For everything else, we just return directly via the + * hardware-pushed stack frame with no special handling. See ARMv7M + * manual B1.5.8. + */ +static bool is_thread_return(uint32_t lr) +{ + return (lr & 0xf000000f) == 0xf000000d; +} + +/* Returns true if the EXC_RETURN address indicates a FPU subframe was + * pushed to the stack. See ARMv7M manual B1.5.8. + */ +static bool fpu_state_pushed(uint32_t lr) +{ + return IS_ENABLED(CONFIG_CPU_HAS_FPU) ? !(lr & 0x10) : false; +} + +/* ICI/IT instruction fault workarounds + * + * ARM Cortex M has what amounts to a design bug. The architecture + * inherits several unpipelined/microcoded "ICI/IT" instruction forms + * that take many cycles to complete (LDM/STM and the Thumb "IT" + * conditional frame are the big ones). But out of a desire to + * minimize interrupt latency, the CPU is allowed to halt and resume + * these instructions mid-flight while they are partially completed. + * The relevant bits of state are stored in the EPSR fields of the + * xPSR register (see ARMv7-M manual B1.4.2). But (and this is the + * design bug) those bits CANNOT BE WRITTEN BY SOFTWARE. They can + * only be modified by exception return. + * + * This means that if a Zephyr thread takes an interrupt + * mid-ICI/IT-instruction, then switches to another thread on exit, + * and then that thread is resumed by a cooperative switch and not an + * interrupt, the instruction will lose the state and restart from + * scratch. For LDM/STM that's generally idempotent for memory (but + * not MMIO!), but for IT that means that the restart will re-execute + * arbitrary instructions that may not be idempotent (e.g. "addeq r0, + * r0, #1" can't be done twice, because you would add two to r0!) + * + * The fix is to check for this condition (which is very rare) on + * interrupt exit when we are switching, and if we discover we've + * interrupted such an instruction we swap the return address with a + * trampoline that uses a UDF instruction to immediately trap to the + * undefined instruction handler, which then recognizes the fixup + * address as special and immediately returns back into the thread + * with the correct EPSR value and resume PC (which have been stashed + * in the thread struct). The overhead for the normal case is just a + * few cycles for the test. + */ +__attribute__((naked)) void arm_m_iciit_stub(void) +{ + __asm__("udf 0;"); +} + +/* Called out of interrupt entry to test for an interrupted instruction */ +static void iciit_fixup(struct k_thread *th, struct hw_frame_base *hw, uint32_t xpsr) +{ +#ifdef CONFIG_MULTITHREADING + if ((xpsr & 0x0600fc00) != 0) { + /* Stash original return address, replace with hook */ + th->arch.iciit_pc = hw->pc; + th->arch.iciit_apsr = hw->apsr; + hw->pc = (uint32_t)arm_m_iciit_stub; + } +#endif +} + +/* Called out of fault handler from the UDF after an arch_switch() */ +bool arm_m_iciit_check(uint32_t msp, uint32_t psp, uint32_t lr) +{ + struct hw_frame_base *f = (void *)psp; + + /* Look for undefined instruction faults from our stub */ + if (pc_match(f->pc, arm_m_iciit_stub)) { + if (is_thread_return(lr)) { + f->pc = _current->arch.iciit_pc; + f->apsr = _current->arch.iciit_apsr; + _current->arch.iciit_pc = 0; + return true; + } + } + return false; +} + +#ifdef CONFIG_BUILTIN_STACK_GUARD +#define PSPLIM(f) ((f)->z.u.sw.psplim) +#else +#define PSPLIM(f) 0 +#endif + +/* Converts, in place, a pickled "switch" frame from a suspended + * thread to a "synthesized" format that can be restored by the CPU + * hardware on exception exit. 
+ */ +static void *arm_m_switch_to_cpu(void *sp) +{ + union frame *f; + uint32_t splim; + +#ifdef CONFIG_FPU + /* When FPU switching is enabled, the suspended handle always + * points to the have_fpu word, which will be followed by FPU + * state if non-zero. + */ + bool have_fpu = (*(uint32_t *)sp) != 0; + + if (have_fpu) { + f = CONTAINER_OF(sp, union frame, zfp.have_fpu); + splim = PSPLIM(f); + __asm__ volatile("vldm %0, {s0-s31}" ::"r"(&f->zfp.s_regs[0])); + SWITCH_TO_SYNTH(f->zfp.u.sw, f->zfp.u.hw); + } else { + f = CONTAINER_OF(sp, union frame, z.have_fpu); + splim = PSPLIM(f); + SWITCH_TO_SYNTH(f->z.u.sw, f->zfp.u.hw); + } +#else + f = CONTAINER_OF(sp, union frame, z.u.sw); + splim = PSPLIM(f); + SWITCH_TO_SYNTH(f->z.u.sw, f->z.u.hw); +#endif + +#ifdef CONFIG_BUILTIN_STACK_GUARD + __asm__ volatile("msr psplim, %0" ::"r"(splim)); +#endif + + /* Mark the callee-saved pointer for the fixup assembly. Note + * funny layout that puts r7 first! + */ + arm_m_cs_ptrs.in = &f->z.u.hw.r7; + + return &f->z.u.hw.base; +} + +static void fpu_cs_copy(struct hw_frame_fpu *src, struct z_frame_fpu *dst) +{ + for (int i = 0; IS_ENABLED(CONFIG_FPU) && i < 16; i++) { + dst->s_regs[i] = src->s_regs[i]; + } +} + +/* Converts, in-place, a CPU-spilled ("hardware") exception entry + * frame to our ("zephyr") switch handle format such that the thread + * can be suspended + */ +static void *arm_m_cpu_to_switch(struct k_thread *th, void *sp, bool fpu) +{ + union frame *f = NULL; + struct hw_frame_base *base = sp; + bool padded = (base->apsr & 0x200); + uint32_t fpscr; + + if (fpu && IS_ENABLED(CONFIG_FPU)) { + uint32_t dummy = 0; + + /* Lazy FPU stacking is enabled, so before we touch + * the stack frame we have to tickle the FPU to force + * it to spill the caller-save registers. Then clear + * CONTROL.FPCA which gets set again by that instruction. + */ + __asm__ volatile("vmov %0, s0;" + "mrs %0, control;" + "bic %0, %0, #4;" + "msr control, %0;" ::"r"(dummy)); + } + + /* Detects interrupted ICI/IT instructions and rigs up thread + * to trap the next time it runs + */ + iciit_fixup(th, base, base->apsr); + + if (IS_ENABLED(CONFIG_FPU) && fpu) { + fpscr = CONTAINER_OF(sp, struct hw_frame_fpu, base)->fpscr; + } + + /* There are four (!) different offsets from the interrupted + * stack at which the hardware frame might be found at + * runtime. These expansions let the compiler generate + * optimized in-place copies for each. In practice it does a + * pretty good job, much better than a double-copy via an + * intermediate buffer. Note that when FPU state is spilled + * we must copy the 16 spilled registers first, to make room + * for the copy. 
+ */ + if (!fpu && !padded) { + f = CONTAINER_OF(sp, union frame, hw.r0); + HW_TO_SWITCH(f->hw, f->z.u.sw); + } else if (!fpu && padded) { + f = CONTAINER_OF(sp, union frame, hw_a.base.r0); + HW_TO_SWITCH(f->hw_a.base, f->z.u.sw); + } else if (fpu && !padded) { + f = CONTAINER_OF(sp, union frame, hwfp.base.r0); + fpu_cs_copy(&f->hwfp, &f->zfp); + HW_TO_SWITCH(f->hwfp.base, f->z.u.sw); + } else if (fpu && padded) { + f = CONTAINER_OF(sp, union frame, hwfp_a.base.base.r0); + fpu_cs_copy(&f->hwfp_a.base, &f->zfp); + HW_TO_SWITCH(f->hwfp_a.base.base, f->z.u.sw); + } + +#ifdef CONFIG_BUILTIN_STACK_GUARD + __asm__ volatile("mrs %0, psplim" : "=r"(f->z.u.sw.psplim)); +#endif + + /* Mark the callee-saved pointer for the fixup assembly */ + arm_m_cs_ptrs.out = &f->z.u.sw.r4; + +#ifdef CONFIG_FPU + if (fpu) { + __asm__ volatile("vstm %0, {s16-s31}" ::"r"(&f->zfp.s_regs[16])); + f->zfp.fpscr = fpscr; + f->zfp.have_fpu = true; + return &f->zfp.have_fpu; + } + f->z.have_fpu = false; + return &f->z.have_fpu; +#else + return &f->z.u.sw; +#endif +} + +void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *arg1, void *arg2, + void *arg3) +{ + struct switch_frame *sw; + uint32_t baddr; + +#ifdef CONFIG_LTO + arm_m_lto_refs[0] = &arm_m_cs_ptrs; + arm_m_lto_refs[1] = arm_m_must_switch; +#endif + +#ifdef CONFIG_MULTITHREADING + /* Kludgey global initialization, stash computed pointers to + * the LR frame location and fixup address into these + * variables for use by arm_m_exc_tail(). Should move to arch + * init somewhere once arch_switch is better integrated + */ + char *stack = (char *)K_KERNEL_STACK_BUFFER(z_interrupt_stacks[0]); + uint32_t *s_top = (uint32_t *)(stack + K_KERNEL_STACK_SIZEOF(z_interrupt_stacks[0])); + + arm_m_exc_lr_ptr = &s_top[-1]; + arm_m_cs_ptrs.lr_fixup = (void *)(1 | (uint32_t)arm_m_exc_exit); /* thumb bit! */ +#endif + + baddr = ((uint32_t)base + 7) & ~7; + + sz = ((uint32_t)(base + sz) - baddr) & ~7; + + if (sz < sizeof(struct switch_frame)) { + return NULL; + } + + /* Note: a useful trick here would be to initialize LR to + * point to cleanup code, avoiding the need for the + * z_thread_entry wrapper, saving a few words of stack frame + * and a few cycles on thread entry. + */ + sw = (void *)(baddr + sz - sizeof(*sw)); + *sw = (struct switch_frame){ + IF_ENABLED(CONFIG_BUILTIN_STACK_GUARD, (.psplim = baddr,)) .r0 = (uint32_t)arg0, + .r1 = (uint32_t)arg1, .r2 = (uint32_t)arg2, .r3 = (uint32_t)arg3, + .pc = ((uint32_t)entry) | 1, /* set thumb bit! */ + .apsr = 0x1000000, /* thumb bit here too! */ + }; + +#ifdef CONFIG_FPU + struct z_frame *zf = CONTAINER_OF(sw, struct z_frame, u.sw); + + zf->have_fpu = false; + return zf; +#else + return sw; +#endif +} + +bool arm_m_do_switch(struct k_thread *last_thread, void *next); + +bool arm_m_must_switch(void) +{ + /* This lock is held until the end of the context switch, at + * which point it will be dropped unconditionally. Save a few + * cycles by skipping the needless bits of arch_irq_lock(). + */ + uint32_t pri = _EXC_IRQ_DEFAULT_PRIO; + + __asm__ volatile("msr basepri, %0" ::"r"(pri)); + + /* Secure mode transitions can push a non-thread frame to the + * stack. If neither is enabled, we already know by construction + * that we're handling the bottom level of the interrupt stack + * and returning to thread mode.
+ */ + if ((IS_ENABLED(CONFIG_ARM_SECURE_FIRMWARE) || IS_ENABLED(CONFIG_ARM_NONSECURE_FIRMWARE)) && + !is_thread_return((uint32_t)arm_m_cs_ptrs.lr_save)) { + return false; + } + + struct k_thread *last_thread = _current; + void *next = z_sched_next_handle(last_thread); + + if (next == NULL) { + return false; + } + + arm_m_do_switch(last_thread, next); + return true; +} + +bool arm_m_do_switch(struct k_thread *last_thread, void *next) +{ + void *last; + bool fpu = fpu_state_pushed((uint32_t)arm_m_cs_ptrs.lr_save); + + __asm__ volatile("mrs %0, psp" : "=r"(last)); + +#ifdef CONFIG_USERSPACE + /* Update CONTROL register's nPRIV bit to reflect user/syscall + * thread state of the incoming thread. + */ + extern char z_syscall_exit_race1, z_syscall_exit_race2; + struct hw_frame_base *f = last; + uint32_t control; + + /* Note that the privilege state is stored in the CPU *AND* in + * the "mode" field of the thread struct. This creates an + * unavoidable race because the syscall exit code can't + * atomically release the lock and set CONTROL. We detect the + * case where a thread is interrupted at exactly that moment + * and manually skip over the MSR instruction (which would + * otherwise fault on resume because "mode" says that the + * thread is unprivileged!). A future rework should just + * pickle the CPU state found in the exception to the frame + * and not mess with thread state from within the syscall. + */ + if (pc_match(f->pc, &z_syscall_exit_race1)) { + f->pc = (uint32_t) &z_syscall_exit_race2; + } + + __asm__ volatile("mrs %0, control" : "=r"(control)); + control = (control & ~1) | (_current->arch.mode & 1); + __asm__ volatile("msr control, %0" ::"r"(control)); +#endif + + last = arm_m_cpu_to_switch(last_thread, last, fpu); + next = arm_m_switch_to_cpu(next); + __asm__ volatile("msr psp, %0" ::"r"(next)); + + /* Undo a UDF fixup applied at interrupt time, no need: we're + * restoring EPSR via interrupt. + */ + if (_current->arch.iciit_pc) { + struct hw_frame_base *n = next; + + n->pc = _current->arch.iciit_pc; + n->apsr = _current->arch.iciit_apsr; + _current->arch.iciit_pc = 0; + } + +#if !defined(CONFIG_MULTITHREADING) + arm_m_last_switch_handle = last; +#elif defined(CONFIG_USE_SWITCH) + last_thread->switch_handle = last; +#endif + +#if defined(CONFIG_USERSPACE) || defined(CONFIG_MPU_STACK_GUARD) + z_arm_configure_dynamic_mpu_regions(_current); +#endif + +#ifdef CONFIG_THREAD_LOCAL_STORAGE + z_arm_tls_ptr = _current->tls; +#endif + +#if defined(CONFIG_BOARD_MPS3_CORSTONE310_FVP) || \ + defined(CONFIG_BOARD_MPS4_CORSTONE315_FVP) || \ + defined(CONFIG_BOARD_MPS4_CORSTONE320_FVP) + /* The ARM Ltd. FVP emulator (at least the ones above that run + * in Zephyr CI) appears to have a bug with the stack + * alignment bit in xPSR. It's common (it fails in the first + * 4-6 timer interrupts in tests.syscalls.timeslicing) that + * we'll take an interrupt from a seemingly aligned (!) stack + * with the bit set. If we then switch and resume the thread + * from a different context later, popping the stack goes + * wrong (more so than just a misalignment of four bytes: I + * usually see it too low by 20 bytes) in a way that it + * doesn't if we return synchronously. Presumably legacy + * PendSV didn't see this because it used the unmodified + * exception frame. + * + * Work around this here, pending validation from ARM, by + * simply assuming all interrupted stacks were aligned and + * clearing the bit. That is NOT correct in the general case, + * but in practice it's enough to get tests to pass. 
+ */ + ((struct hw_frame_base *)next)->apsr &= ~BIT(9); +#endif + + return true; +} + +/* This is handled as an inline now for C code, but there are a few spots + * that need to get to it from assembly (but which IMHO should really + * be ported to C) + */ +void arm_m_legacy_exit(void) +{ + arm_m_exc_tail(); +} + +/* We arrive here on return to thread code from exception handlers. + * We know that r4-r11 of the interrupted thread have been restored + * (other registers will be forgotten and can be clobbered). First + * call arm_m_must_switch() (which handles the other context switch + * duties), and spill/fill if necessary. If no context switch is + * needed, we just return via the original LR. If we are switching, + * we synthesize an integer-only EXC_RETURN as FPU state switching was + * handled in software already. + */ +#ifdef CONFIG_MULTITHREADING +__attribute__((naked)) void arm_m_exc_exit(void) +{ + __asm__(" bl arm_m_must_switch;" + " ldr r2, =arm_m_cs_ptrs;" + " mov r3, #0;" + " ldr lr, [r2, #8];" /* lr_save */ + " cbz r0, 1f;" + " mov lr, #0xfffffffd;" /* integer-only LR */ + " ldm r2, {r0, r1};" /* fields: out, in */ + " stm r0, {r4-r11};" /* out is a switch_frame */ + " ldm r1!, {r7-r11};" /* in is a synth_frame */ + " ldm r1, {r4-r6};" + "1:\n" + " msr basepri, r3;" /* release lock taken in must_switch */ + " bx lr;"); +} +#endif diff --git a/arch/arm/core/cortex_m/coredump.c b/arch/arm/core/cortex_m/coredump.c index 787cfcdf4a751..34ba08f60df9e 100644 --- a/arch/arm/core/cortex_m/coredump.c +++ b/arch/arm/core/cortex_m/coredump.c @@ -100,5 +100,12 @@ uint16_t arch_coredump_tgt_code_get(void) uintptr_t arch_coredump_stack_ptr_get(const struct k_thread *thread) { - return (thread == _current) ? z_arm_coredump_fault_sp : thread->callee_saved.psp; + if (thread == _current) { + return z_arm_coredump_fault_sp; + } +#ifdef CONFIG_USE_SWITCH + return (uintptr_t)thread->switch_handle; +#else + return thread->callee_saved.psp; +#endif } diff --git a/arch/arm/core/cortex_m/fault.c b/arch/arm/core/cortex_m/fault.c index 204b403535cd0..f6f3a68719b4c 100644 --- a/arch/arm/core/cortex_m/fault.c +++ b/arch/arm/core/cortex_m/fault.c @@ -1029,6 +1029,13 @@ void z_arm_fault(uint32_t msp, uint32_t psp, uint32_t exc_return, _callee_saved_ bool recoverable, nested_exc; struct arch_esf *esf; +#ifdef CONFIG_USE_SWITCH + /* Handle the stub fault to restore interrupted ICI/IT instructions */ + if (arm_m_iciit_check(msp, psp, exc_return)) { + return; + } +#endif + /* Create a stack-ed copy of the ESF to be used during * the fault handling process. */ @@ -1081,6 +1088,31 @@ void z_arm_fault(uint32_t msp, uint32_t psp, uint32_t exc_return, _callee_saved_ } z_arm_fatal_error(reason, &esf_copy); + +#if defined(CONFIG_USE_SWITCH) + /* The arch_switch implementation can expand the size of the + * frame when it saves the outgoing context (to the faulting + * thread), so make sure it has room. The resulting stack + * will obviously be corrupt, but it won't overflow the + * guarded region. + */ +#if defined(CONFIG_BUILTIN_STACK_GUARD) + uint32_t psplim, psp2; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + psp2 = MAX(psplim + arm_m_switch_stack_buffer, psp); + __asm__ volatile("msr psp, %0" ::"r"(psp2)); +#endif + + /* Prepare interrupt exit for context switch, but only if this + * is the lowest level interrupt (if we're nested, that was a + * fault that happened in the ISR and shouldn't affect thread + * state).
+ */ + if ((exc_return & 0xf000000f) == 0xf000000d) { + arm_m_exc_tail(); + } +#endif } /** diff --git a/arch/arm/core/cortex_m/fault_s.S b/arch/arm/core/cortex_m/fault_s.S index 43d8eaa34dd56..8b2e30d71af97 100644 --- a/arch/arm/core/cortex_m/fault_s.S +++ b/arch/arm/core/cortex_m/fault_s.S @@ -80,6 +80,7 @@ SECTION_SUBSEC_FUNC(TEXT,__fault,z_arm_exc_spurious) mrs r0, MSP mrs r1, PSP push {r0, lr} + #if defined(CONFIG_EXTRA_EXCEPTION_INFO) /* Build _callee_saved_t. To match the struct * definition we push the psp & then r11-r4 @@ -109,5 +110,4 @@ SECTION_SUBSEC_FUNC(TEXT,__fault,z_arm_exc_spurious) add sp, #40 #endif pop {r0, pc} - .end diff --git a/arch/arm/core/cortex_m/svc.S b/arch/arm/core/cortex_m/svc.S new file mode 100644 index 0000000000000..3676e9dd580d2 --- /dev/null +++ b/arch/arm/core/cortex_m/svc.S @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2013-2014 Wind River Systems, Inc. + * Copyright (c) 2017-2019 Nordic Semiconductor ASA. + * Copyright (c) 2020 Stephanos Ioannidis + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @file + * @brief SVC exception handling for ARM Cortex-M + * + * This module implements the z_arm_svc exception vector on ARM Cortex-M CPUs. + */ + +#include +#include +#include +#include + +_ASM_FILE_PROLOGUE + +GTEXT(z_arm_svc) +GTEXT(z_do_kernel_oops) +#if defined(CONFIG_USERSPACE) +GTEXT(z_arm_do_syscall) +#endif + +GDATA(_kernel) + +/** + * + * @brief Service call handler + * + * The service call (svc) is used in the following occasions: + * - IRQ offloading + * - Kernel run-time exceptions + * - System Calls (User mode) + * + */ +SECTION_FUNC(TEXT, z_arm_svc) + /* Use EXC_RETURN state to find out if stack frame is on the + * MSP or PSP + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r0, #_EXC_RETURN_SPSEL_Msk + mov r1, lr + tst r1, r0 + beq .L_stack_frame_msp + mrs r0, PSP + bne .L_stack_frame_endif +.L_stack_frame_msp: + mrs r0, MSP +.L_stack_frame_endif: +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + tst lr, #_EXC_RETURN_SPSEL_Msk /* did we come from thread mode ? 
*/ + ite eq /* if zero (equal), came from handler mode */ + mrseq r0, MSP /* handler mode, stack frame is on MSP */ + mrsne r0, PSP /* thread mode, stack frame is on PSP */ +#endif + + + /* Figure out what SVC call number was invoked */ + + ldr r1, [r0, #24] /* grab address of PC from stack frame */ + /* SVC is a two-byte instruction, point to it and read the + * SVC number (lower byte of SCV instruction) + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + subs r1, r1, #2 + ldrb r1, [r1] +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldrb r1, [r1, #-2] +#endif + + /* + * grab service call number: + * 0: Unused + * 1: irq_offload (if configured) + * 2: kernel panic or oops (software generated fatal exception) + * 3: System call (if user mode supported) + */ +#if defined(CONFIG_USERSPACE) + mrs r2, CONTROL + + cmp r1, #3 + beq .L_do_syscall + + /* + * check that we are privileged before invoking other SVCs + * oops if we are unprivileged + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r3, #0x1 + tst r2, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + tst r2, #0x1 +#endif + bne .L_oops + +#endif /* CONFIG_USERSPACE */ + + cmp r1, #2 + beq .L_oops + +#if defined(CONFIG_IRQ_OFFLOAD) + push {r0, lr} + bl z_irq_do_offload /* call C routine which executes the offload */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + pop {r0, r3} + mov lr, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + pop {r0, lr} +#endif + +#ifdef CONFIG_USE_SWITCH + /* FIXME: this uses the pre-hook LR, but we want to refetch the + * one in the stack frame, otherwise irq_offload() won't context + * switch correctly (which... isn't tested?) + */ + push {r0, lr} + bl arm_m_legacy_exit + pop {r0, lr} + bx lr +#else + ldr r0, =z_arm_int_exit + bx r0 +#endif + +#endif + +.L_oops: + push {r0, lr} +#if defined(CONFIG_EXTRA_EXCEPTION_INFO) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* Build _callee_saved_t. To match the struct + * definition we push the psp & then r11-r4 + */ + mrs r1, PSP + push {r1, r2} + push {r4-r11} + mov r1, sp /* pointer to _callee_saved_t */ +#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ +#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ + mov r2, lr /* EXC_RETURN */ + bl z_do_kernel_oops + /* return from SVC exception is done here */ +#if defined(CONFIG_EXTRA_EXCEPTION_INFO) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* We do not need to restore any register state here + * because we did not use any callee-saved registers + * in this routine. Therefore, we can just reset + * the MSP to its value prior to entering the function + */ + add sp, #40 +#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ +#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ + +#ifdef CONFIG_USE_SWITCH + bl arm_m_legacy_exit +#endif + + pop {r0, pc} + +#if defined(CONFIG_USERSPACE) + /* + * System call will setup a jump to the z_arm_do_syscall() function + * when the SVC returns via the bx lr. + * + * There is some trickery involved here because we have to preserve + * the original PC value so that we can return back to the caller of + * the SVC. 
+ * + * On SVC exception, the stack looks like the following: + * r0 - r1 - r2 - r3 - r12 - LR - PC - PSR + * + * Registers look like: + * r0 - arg1 + * r1 - arg2 + * r2 - arg3 + * r3 - arg4 + * r4 - arg5 + * r5 - arg6 + * r6 - call_id + * r8 - saved link register + */ +.L_do_syscall: +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r3, #24 + ldr r1, [r0, r3] /* grab address of PC from stack frame */ + mov r8, r1 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldr r8, [r0, #24] /* grab address of PC from stack frame */ +#endif + ldr r1, =z_arm_do_syscall +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + str r1, [r0, r3] /* overwrite the PC to point to z_arm_do_syscall */ +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + str r1, [r0, #24] /* overwrite the PC to point to z_arm_do_syscall */ +#endif + +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + ldr r3, =K_SYSCALL_LIMIT + cmp r6, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* validate syscall limit */ + ldr ip, =K_SYSCALL_LIMIT + cmp r6, ip +#endif + /* The supplied syscall_id must be lower than the limit + * (Requires unsigned integer comparison) + */ + blo .L_valid_syscall_id + + /* bad syscall id. Set arg1 to bad id and set call_id to SYSCALL_BAD */ + str r6, [r0] + ldr r6, =K_SYSCALL_BAD + + /* Bad syscalls treated as valid syscalls with ID K_SYSCALL_BAD. */ + +.L_valid_syscall_id: + ldr r0, =_kernel + ldr r0, [r0, #_kernel_offset_to_current] +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + mov ip, r2 + ldr r1, =_thread_offset_to_mode + ldr r3, [r0, r1] + movs r2, #1 + bics r3, r2 + /* Store (privileged) mode in thread's mode state variable */ + str r3, [r0, r1] + mov r2, ip + dsb + /* set mode to privileged, r2 still contains value from CONTROL */ + movs r3, #1 + bics r2, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldr r1, [r0, #_thread_offset_to_mode] + bic r1, #1 + /* Store (privileged) mode in thread's mode state variable */ + str r1, [r0, #_thread_offset_to_mode] + dsb + /* set mode to privileged, r2 still contains value from CONTROL */ + bic r2, #1 +#endif + msr CONTROL, r2 + + /* ISB is not strictly necessary here (stack pointer is not being + * touched), but it's recommended to avoid executing pre-fetched + * instructions with the previous privilege. + */ + isb + +#if defined(CONFIG_BUILTIN_STACK_GUARD) + /* Thread is now in privileged mode; after returning from SVCall it + * will use the default (user) stack before switching to the privileged + * stack to execute the system call. We need to protect the user stack + * against stack overflows until this stack transition.
+ */ + ldr r1, [r0, #_thread_offset_to_stack_info_start] /* stack_info.start */ + msr PSPLIM, r1 +#endif /* CONFIG_BUILTIN_STACK_GUARD */ + + /* Take lock before returning from svc */ +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + mov r0, #_EXC_IRQ_DEFAULT_PRIO + msr BASEPRI, r0 +#else + cpsid i +#endif + + /* return from SVC to the modified LR - z_arm_do_syscall */ + bx lr +#endif /* CONFIG_USERSPACE */ diff --git a/arch/arm/core/cortex_m/swap_helper.S b/arch/arm/core/cortex_m/swap_helper.S index 40f00fc005629..0aa729a91c336 100644 --- a/arch/arm/core/cortex_m/swap_helper.S +++ b/arch/arm/core/cortex_m/swap_helper.S @@ -15,21 +15,12 @@ */ #include -#include #include #include -#include -#include -#include _ASM_FILE_PROLOGUE -GTEXT(z_arm_svc) GTEXT(z_arm_pendsv) -GTEXT(z_do_kernel_oops) -#if defined(CONFIG_USERSPACE) -GTEXT(z_arm_do_syscall) -#endif GDATA(_kernel) @@ -346,225 +337,3 @@ SECTION_FUNC(TEXT, z_arm_pendsv) * Cortex-M: return from PendSV exception */ bx lr - -/** - * - * @brief Service call handler - * - * The service call (svc) is used in the following occasions: - * - IRQ offloading - * - Kernel run-time exceptions - * - System Calls (User mode) - * - */ -SECTION_FUNC(TEXT, z_arm_svc) - /* Use EXC_RETURN state to find out if stack frame is on the - * MSP or PSP - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r0, #_EXC_RETURN_SPSEL_Msk - mov r1, lr - tst r1, r0 - beq .L_stack_frame_msp - mrs r0, PSP - bne .L_stack_frame_endif -.L_stack_frame_msp: - mrs r0, MSP -.L_stack_frame_endif: -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - tst lr, #_EXC_RETURN_SPSEL_Msk /* did we come from thread mode ? */ - ite eq /* if zero (equal), came from handler mode */ - mrseq r0, MSP /* handler mode, stack frame is on MSP */ - mrsne r0, PSP /* thread mode, stack frame is on PSP */ -#endif - - - /* Figure out what SVC call number was invoked */ - - ldr r1, [r0, #24] /* grab address of PC from stack frame */ - /* SVC is a two-byte instruction, point to it and read the - * SVC number (lower byte of SCV instruction) - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - subs r1, r1, #2 - ldrb r1, [r1] -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldrb r1, [r1, #-2] -#endif - - /* - * grab service call number: - * 0: Unused - * 1: irq_offload (if configured) - * 2: kernel panic or oops (software generated fatal exception) - * 3: System call (if user mode supported) - */ -#if defined(CONFIG_USERSPACE) - mrs r2, CONTROL - - cmp r1, #3 - beq .L_do_syscall - - /* - * check that we are privileged before invoking other SVCs - * oops if we are unprivileged - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r3, #0x1 - tst r2, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - tst r2, #0x1 -#endif - bne .L_oops - -#endif /* CONFIG_USERSPACE */ - - cmp r1, #2 - beq .L_oops - -#if defined(CONFIG_IRQ_OFFLOAD) - push {r0, lr} - bl z_irq_do_offload /* call C routine which executes the offload */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - pop {r0, r3} - mov lr, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - pop {r0, lr} -#endif - - /* exception return is done in z_arm_int_exit() */ - ldr r0, =z_arm_int_exit - bx r0 - -#endif - -.L_oops: - push {r0, lr} -#if defined(CONFIG_EXTRA_EXCEPTION_INFO) -#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* Build _callee_saved_t. 
To match the struct - * definition we push the psp & then r11-r4 - */ - mrs r1, PSP - push {r1, r2} - push {r4-r11} - mov r1, sp /* pointer to _callee_saved_t */ -#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ -#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ - mov r2, lr /* EXC_RETURN */ - bl z_do_kernel_oops - /* return from SVC exception is done here */ -#if defined(CONFIG_EXTRA_EXCEPTION_INFO) -#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* We do not need to restore any register state here - * because we did not use any callee-saved registers - * in this routine. Therefore, we can just reset - * the MSP to its value prior to entering the function - */ - add sp, #40 -#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ -#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ - pop {r0, pc} - -#if defined(CONFIG_USERSPACE) - /* - * System call will setup a jump to the z_arm_do_syscall() function - * when the SVC returns via the bx lr. - * - * There is some trickery involved here because we have to preserve - * the original PC value so that we can return back to the caller of - * the SVC. - * - * On SVC exeption, the stack looks like the following: - * r0 - r1 - r2 - r3 - r12 - LR - PC - PSR - * - * Registers look like: - * r0 - arg1 - * r1 - arg2 - * r2 - arg3 - * r3 - arg4 - * r4 - arg5 - * r5 - arg6 - * r6 - call_id - * r8 - saved link register - */ -.L_do_syscall: -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r3, #24 - ldr r1, [r0, r3] /* grab address of PC from stack frame */ - mov r8, r1 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldr r8, [r0, #24] /* grab address of PC from stack frame */ -#endif - ldr r1, =z_arm_do_syscall -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - str r1, [r0, r3] /* overwrite the PC to point to z_arm_do_syscall */ -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - str r1, [r0, #24] /* overwrite the PC to point to z_arm_do_syscall */ -#endif - -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - ldr r3, =K_SYSCALL_LIMIT - cmp r6, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* validate syscall limit */ - ldr ip, =K_SYSCALL_LIMIT - cmp r6, ip -#endif - /* The supplied syscall_id must be lower than the limit - * (Requires unsigned integer comparison) - */ - blo .L_valid_syscall_id - - /* bad syscall id. Set arg1 to bad id and set call_id to SYSCALL_BAD */ - str r6, [r0] - ldr r6, =K_SYSCALL_BAD - - /* Bad syscalls treated as valid syscalls with ID K_SYSCALL_BAD. */ - -.L_valid_syscall_id: - ldr r0, =_kernel - ldr r0, [r0, #_kernel_offset_to_current] -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - mov ip, r2 - ldr r1, =_thread_offset_to_mode - ldr r3, [r0, r1] - movs r2, #1 - bics r3, r2 - /* Store (privileged) mode in thread's mode state variable */ - str r3, [r0, r1] - mov r2, ip - dsb - /* set mode to privileged, r2 still contains value from CONTROL */ - movs r3, #1 - bics r2, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldr r1, [r0, #_thread_offset_to_mode] - bic r1, #1 - /* Store (privileged) mode in thread's mode state variable */ - str r1, [r0, #_thread_offset_to_mode] - dsb - /* set mode to privileged, r2 still contains value from CONTROL */ - bic r2, #1 -#endif - msr CONTROL, r2 - - /* ISB is not strictly necessary here (stack pointer is not being - * touched), but it's recommended to avoid executing pre-fetched - * instructions with the previous privilege. 
- */ - isb - -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Thread is now in privileged mode; after returning from SCVall it - * will use the default (user) stack before switching to the privileged - * stack to execute the system call. We need to protect the user stack - * against stack overflows until this stack transition. - */ - ldr r1, [r0, #_thread_offset_to_stack_info_start] /* stack_info.start */ - msr PSPLIM, r1 -#endif /* CONFIG_BUILTIN_STACK_GUARD */ - - /* return from SVC to the modified LR - z_arm_do_syscall */ - bx lr -#endif /* CONFIG_USERSPACE */ diff --git a/arch/arm/core/cortex_m/thread.c b/arch/arm/core/cortex_m/thread.c index 7abba0cdeef2c..835869f8bba5b 100644 --- a/arch/arm/core/cortex_m/thread.c +++ b/arch/arm/core/cortex_m/thread.c @@ -60,8 +60,6 @@ K_THREAD_STACK_DECLARE(z_main_stack, CONFIG_MAIN_STACK_SIZE); void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *stack_ptr, k_thread_entry_t entry, void *p1, void *p2, void *p3) { - struct __basic_sf *iframe; - #ifdef CONFIG_MPU_STACK_GUARD #if defined(CONFIG_USERSPACE) if (z_stack_is_user_capable(stack)) { @@ -85,17 +83,23 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta #endif /* FP_GUARD_EXTRA_SIZE */ #endif /* CONFIG_MPU_STACK_GUARD */ - iframe = Z_STACK_PTR_TO_FRAME(struct __basic_sf, stack_ptr); + void *entry_wrapper = z_thread_entry; + #if defined(CONFIG_USERSPACE) if ((thread->base.user_options & K_USER) != 0) { - iframe->pc = (uint32_t)arch_user_mode_enter; - } else { - iframe->pc = (uint32_t)z_thread_entry; + entry_wrapper = (void *)arch_user_mode_enter; } -#else - iframe->pc = (uint32_t)z_thread_entry; #endif +#ifdef CONFIG_USE_SWITCH + thread->switch_handle = arm_m_new_stack((char *)stack, stack_ptr - (char *)stack, + entry_wrapper, entry, p1, p2, p3); + thread->arch.iciit_pc = 0; +#else + struct __basic_sf *iframe = Z_STACK_PTR_TO_FRAME(struct __basic_sf, stack_ptr); + + iframe->pc = (uint32_t)entry_wrapper; + /* force ARM mode by clearing LSB of address */ iframe->pc &= 0xfffffffe; iframe->a1 = (uint32_t)entry; @@ -106,7 +110,11 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta iframe->xpsr = 0x01000000UL; /* clear all, thumb bit is 1, even if RO */ thread->callee_saved.psp = (uint32_t)iframe; +#endif + +#ifndef CONFIG_USE_SWITCH thread->arch.basepri = 0; +#endif #if defined(CONFIG_ARM_STORE_EXC_RETURN) || defined(CONFIG_USERSPACE) thread->arch.mode = 0; @@ -355,7 +363,7 @@ void configure_builtin_stack_guard(struct k_thread *thread) * @return The lowest allowed stack frame pointer, if error is a * thread stack corruption, otherwise return 0. 
*/ -uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp) +static uint32_t min_stack(const uint32_t fault_addr, const uint32_t psp) { #if defined(CONFIG_MULTITHREADING) const struct k_thread *thread = _current; @@ -418,6 +426,17 @@ uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp return 0; } + +uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp) +{ + uint32_t sp = min_stack(fault_addr, psp); + + if (sp != 0 && IS_ENABLED(CONFIG_USE_SWITCH)) { + sp += arm_m_switch_stack_buffer; + } + return sp; +} + #endif /* CONFIG_MPU_STACK_GUARD || CONFIG_USERSPACE */ #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) diff --git a/arch/arm/core/cortex_m/vector_table.S b/arch/arm/core/cortex_m/vector_table.S index 45e10ea803499..5a43f977ab6af 100644 --- a/arch/arm/core/cortex_m/vector_table.S +++ b/arch/arm/core/cortex_m/vector_table.S @@ -74,7 +74,7 @@ SECTION_SUBSEC_FUNC(exc_vector_table,_vector_table_section,_vector_table) #error Unknown ARM architecture #endif /* CONFIG_ARMV6_M_ARMV8_M_BASELINE */ .word 0 -#if defined(CONFIG_MULTITHREADING) +#if defined(CONFIG_MULTITHREADING) && !defined(CONFIG_USE_SWITCH) .word z_arm_pendsv #else .word z_arm_exc_spurious diff --git a/arch/arm/core/offsets/offsets_aarch32.c b/arch/arm/core/offsets/offsets_aarch32.c index 693546630b05e..5147ced51eb2c 100644 --- a/arch/arm/core/offsets/offsets_aarch32.c +++ b/arch/arm/core/offsets/offsets_aarch32.c @@ -29,8 +29,10 @@ #include #include +#if !(defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH)) GEN_OFFSET_SYM(_thread_arch_t, basepri); GEN_OFFSET_SYM(_thread_arch_t, swap_return_value); +#endif #if defined(CONFIG_CPU_AARCH32_CORTEX_A) || defined(CONFIG_CPU_AARCH32_CORTEX_R) GEN_OFFSET_SYM(_thread_arch_t, exception_depth); diff --git a/arch/arm/core/userspace.S b/arch/arm/core/userspace.S index 3594940078da3..4576907948703 100644 --- a/arch/arm/core/userspace.S +++ b/arch/arm/core/userspace.S @@ -26,11 +26,21 @@ GTEXT(arch_user_string_nlen) GTEXT(z_arm_user_string_nlen_fault_start) GTEXT(z_arm_user_string_nlen_fault_end) GTEXT(z_arm_user_string_nlen_fixup) +GTEXT(z_syscall_exit_race1) +GTEXT(z_syscall_exit_race2) GDATA(_kernel) /* Imports */ GDATA(_k_syscall_table) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) +# define LOCK(reg) mov reg, #_EXC_IRQ_DEFAULT_PRIO ; msr BASEPRI, reg +# define UNLOCK(reg) mov reg, #0 ; msr BASEPRI, reg +#else +# define LOCK(unused) cpsid i +# define UNLOCK(unused) cpsie i +#endif + /** * * User space entry function @@ -47,7 +57,9 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) /* move user_entry to lr */ mov lr, r0 - /* prepare to set stack to privileged stack */ + LOCK(r0) + + /* prepare to set stack to privilege */ ldr r0, =_kernel ldr r0, [r0, #_kernel_offset_to_current] #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) @@ -213,15 +225,8 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) * large enough to accommodate a maximum exception stack frame. */ - /* Temporarily store current IRQ locking status in ip */ - mrs ip, BASEPRI push {r0, ip} - /* Lock PendSV while reprogramming PSP and PSPLIM */ - mov r0, #_EXC_PENDSV_PRIO_MASK - msr BASEPRI_MAX, r0 - isb - /* Set PSPLIM to guard the thread's user stack. 
*/ ldr r0, =_kernel ldr r0, [r0, #_kernel_offset_to_current] @@ -235,11 +240,7 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) msr PSP, r0 #endif -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Restore interrupt lock status */ - msr BASEPRI, ip - isb -#endif + UNLOCK(r0) /* restore r0 */ mov r0, lr @@ -394,6 +395,8 @@ SECTION_FUNC(TEXT, z_arm_do_syscall) msr PSPLIM, ip #endif + UNLOCK(ip) + /* * r0-r5 contain arguments * r6 contains call_id @@ -478,6 +481,7 @@ dispatch_syscall: ldr lr, [sp,#16] #endif + LOCK(ip) #if defined(CONFIG_BUILTIN_STACK_GUARD) /* @@ -497,14 +501,6 @@ dispatch_syscall: * large enough to accommodate a maximum exception stack frame. */ - /* Temporarily store current IRQ locking status in r2 */ - mrs r2, BASEPRI - - /* Lock PendSV while reprogramming PSP and PSPLIM */ - mov r3, #_EXC_PENDSV_PRIO_MASK - msr BASEPRI_MAX, r3 - isb - /* Set PSPLIM to guard the thread's user stack. */ ldr r3, =_kernel ldr r3, [r3, #_kernel_offset_to_current] @@ -527,12 +523,6 @@ dispatch_syscall: msr PSP, ip #endif -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Restore interrupt lock status */ - msr BASEPRI, r2 - isb -#endif - push {r0, r1} #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) push {r2, r3} @@ -548,6 +538,7 @@ dispatch_syscall: /* drop privileges by setting bit 0 in CONTROL */ mrs r2, CONTROL orrs r2, r2, r3 + msr CONTROL, r2 pop {r2, r3} #elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) \ @@ -559,11 +550,19 @@ dispatch_syscall: /* Store (unprivileged) mode in thread's mode state variable */ str r1, [r0, #_thread_offset_to_mode] dsb + #if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) /* drop privileges by setting bit 0 in CONTROL */ mrs ip, CONTROL orrs ip, ip, #1 + + /* drop lock immediately before setting CONTROL, see comments in arm-m-switch.c */ + UNLOCK(r0) +z_syscall_exit_race1: msr CONTROL, ip +z_syscall_exit_race2: +#else + UNLOCK(r0) #endif #endif diff --git a/arch/arm/include/cortex_m/kernel_arch_func.h b/arch/arm/include/cortex_m/kernel_arch_func.h index 5abe5870762cf..f3e2d35b98c74 100644 --- a/arch/arm/include/cortex_m/kernel_arch_func.h +++ b/arch/arm/include/cortex_m/kernel_arch_func.h @@ -27,6 +27,8 @@ extern "C" { #endif #ifndef _ASMLANGUAGE +#include + extern void z_arm_fault_init(void); extern void z_arm_cpu_idle_init(void); #ifdef CONFIG_ARM_MPU @@ -61,10 +63,12 @@ static ALWAYS_INLINE void arch_kernel_init(void) #endif /* CONFIG_SOC_PER_CORE_INIT_HOOK */ } +#ifndef CONFIG_USE_SWITCH static ALWAYS_INLINE void arch_thread_return_value_set(struct k_thread *thread, unsigned int value) { thread->arch.swap_return_value = value; } +#endif #if !defined(CONFIG_MULTITHREADING) extern FUNC_NORETURN void z_arm_switch_to_main_no_multithreading(k_thread_entry_t main_func, @@ -79,6 +83,7 @@ extern FUNC_NORETURN void z_arm_userspace_enter(k_thread_entry_t user_entry, voi extern void z_arm_fatal_error(unsigned int reason, const struct arch_esf *esf); +#ifndef CONFIG_USE_SWITCH static ALWAYS_INLINE int arch_swap(unsigned int key) { /* store off key and return value */ @@ -96,6 +101,7 @@ static ALWAYS_INLINE int arch_swap(unsigned int key) */ return _current->arch.swap_return_value; } +#endif #endif /* _ASMLANGUAGE */ diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h new file mode 100644 index 0000000000000..2bd9acf6b9aca --- /dev/null +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -0,0 +1,234 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _ZEPHYR_ARCH_ARM_M_SWITCH_H +#define 
_ZEPHYR_ARCH_ARM_M_SWITCH_H + +#include +#include +#include + +/* GCC/gas has a code generation buglet on thumb. The R7 register is + * the ABI-defined frame pointer, though it's usually unused in zephyr + * due to -fomit-frame-pointer (and the fact the DWARF on ARM doesn't + * really need it). But when it IS enabled, which sometimes seems to + * happen due to toolchain internals, GCC is unable to allow its use + * in the clobber list of an asm() block (I guess it can't generate + * spill/fill code without using the frame?). + * + * When absolutely needed, this kconfig unmasks a workaround where we + * spill/fill R7 around the switch manually. + */ +#ifdef CONFIG_ARM_GCC_FP_WORKAROUND +#define _R7_CLOBBER_OPT(expr) expr +#else +#define _R7_CLOBBER_OPT(expr) /**/ +#endif + +/* Should probably be in kconfig, basically this is testing whether or + * not the toolchain will allow a "g" flag (DSP state) to an "msr + * apsr_" instruction. + */ +#if defined(CONFIG_CPU_CORTEX_M4) || defined(CONFIG_CPU_CORTEX_M7) || defined(CONFIG_ARMV8_M_DSP) +#define _ARM_M_SWITCH_HAVE_DSP +#endif + +void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *arg1, void *arg2, + void *arg3); + +bool arm_m_must_switch(void); + +void arm_m_exc_exit(void); + +bool arm_m_iciit_check(uint32_t msp, uint32_t psp, uint32_t lr); + +void arm_m_iciit_stub(void); + +extern uint32_t *arm_m_exc_lr_ptr; + +void z_arm_configure_dynamic_mpu_regions(struct k_thread *thread); + +extern uintptr_t z_arm_tls_ptr; + +extern uint32_t arm_m_switch_stack_buffer; + +/* Global pointers to the frame locations for the callee-saved + * registers. Set in arm_m_must_switch(), and used by the fixup + * assembly in arm_m_exc_exit. + */ +struct arm_m_cs_ptrs { + void *out, *in, *lr_save, *lr_fixup; +}; +extern struct arm_m_cs_ptrs arm_m_cs_ptrs; + +static inline void arm_m_exc_tail(void) +{ +#ifdef CONFIG_MULTITHREADING + /* Dirty trickery. We defer as much interrupt-exit work until + * the very last moment, when the top-level ISR returns back + * into user code. We do this by replacing the topmost (!) LR + * return address in the stack frame with our fixup code at + * arm_m_exc_exit(). By running after the ISR return, it + * knows that the callee-save registers r4-r11 (which need to + * be saved to the outgoing thread) are restored. + * + * Obviously this only works if the ISR is "ABI-compliant + * enough". It doesn't have to have pushed a complete frame, + * but it does have to have put LR into its standard location. + * In practice generated code does (because it has to store LR + * somewhere so it can call other functions and then pop it to + * return), so this works even on code built with + * -fomit-frame-pointer. If an app needs a direct interrupt + * and can't meet these requirements, it can always skip this + * call and return directly (reschedule is optional for direct + * interrupts anyway). + * + * Finally note the call to check_stack_sentinel here: that is + * normally called from context switch at the end, but will + * toss an exception, which we can't allow (without hardship) + * on the path from here to interrupt exit. It will mess up + * our bookkeeping around EXC_RETURN, so do it early.
+ */ + void z_check_stack_sentinel(void); + void *isr_lr = (void *)*arm_m_exc_lr_ptr; + + if (IS_ENABLED(CONFIG_STACK_SENTINEL)) { + z_check_stack_sentinel(); + } + if (isr_lr != arm_m_cs_ptrs.lr_fixup) { + arm_m_cs_ptrs.lr_save = isr_lr; + *arm_m_exc_lr_ptr = (uint32_t)arm_m_cs_ptrs.lr_fixup; + } +#endif +} + +static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) +{ +#if defined(CONFIG_USERSPACE) || defined(CONFIG_MPU_STACK_GUARD) + z_arm_configure_dynamic_mpu_regions(_current); +#endif + +#ifdef CONFIG_THREAD_LOCAL_STORAGE + z_arm_tls_ptr = _current->tls; +#endif + +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + /* Set things up to write the CONTROL.nPRIV bit. We know the outgoing + * thread is in privileged mode (because you can't reach a + * context switch unless you're in the kernel!). + */ + extern uint32_t arm_m_switch_control; + uint32_t control; + + __asm__ volatile("mrs %0, control" : "=r"(control)); + arm_m_switch_control = (control & ~1) | (_current->arch.mode & 1); +#endif + + /* new switch handle in r4, old switch handle pointer in r5. + * r6-r8 are used by the code here, and r9-r11 are + * unsaved/clobbered (they are very likely to be caller-saved + * registers in the enclosing function that the compiler can + * avoid using, i.e. we can let it make the call and avoid a + * double-spill). But all registers are restored fully + * (because we might be switching to an interrupt-saved frame) + */ + register uint32_t r4 __asm__("r4") = (uint32_t)switch_to; + register uint32_t r5 __asm__("r5") = (uint32_t)switched_from; + __asm__ volatile(_R7_CLOBBER_OPT("push {r7};") + /* Construct and push a {r12, lr, pc} group at the top + * of the frame, where PC points to the final restore location + * at the end of this sequence. + */ + "mov r6, r12;" + "mov r7, lr;" + "ldr r8, =3f;" /* address of restore PC */ + "add r8, r8, #1;" /* set thumb bit */ + "push {r6-r8};" + "sub sp, sp, #24;" /* skip over space for r6-r11 */ + "push {r0-r5};" + "mov r2, #0x01000000;" /* APSR (only care about thumb bit) */ + "mov r0, #0;" /* Leave r0 zero for code below */ +#ifdef CONFIG_BUILTIN_STACK_GUARD + "mrs r1, psplim;" + "push {r1-r2};" + "msr psplim, r0;" /* zero it so we can move the stack */ +#else + "push {r2};" +#endif + +#ifdef CONFIG_FPU + /* Push FPU state (if active) to our outgoing stack */ + " mrs r8, control;" /* read CONTROL.FPCA */ + " and r7, r8, #4;" /* r7 == have_fpu */ + " cbz r7, 1f;" + " bic r8, r8, #4;" /* clear CONTROL.FPCA */ + " msr control, r8;" + " vmrs r6, fpscr;" + " push {r6};" + " vpush {s0-s31};" + "1: push {r7};" /* have_fpu word */ + + /* Pop FPU state (if present) from incoming frame in r4 */ + " ldm r4!, {r7};" /* have_fpu word */ + " cbz r7, 2f;" + " vldm r4!, {s0-s31};" /* (note: sets FPCA bit for us) */ + " ldm r4!, {r6};" + " vmsr fpscr, r6;" + "2:;" +#endif + +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + " ldr r8, =arm_m_switch_control;" + " ldr r8, [r8];" +#endif + + /* Save the outgoing switch handle (which is SP), swap stacks, + * and enable interrupts. The restore process is + * interruptible code (running in the incoming thread) once + * the stack is valid. + */ + "str sp, [r5];" + "mov sp, r4;" + "msr basepri, r0;" + +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + " msr control, r8;" /* Now we can drop privilege */ +#endif + + /* Restore is super simple: pop the flags (and stack limit if + * enabled) then slurp in the whole GPR set in two + * instructions.
(The instruction encoding disallows popping + * both LR and PC in a single instruction) + */ +#ifdef CONFIG_BUILTIN_STACK_GUARD + "pop {r1-r2};" + "msr psplim, r1;" +#else + "pop {r2};" +#endif +#ifdef _ARM_M_SWITCH_HAVE_DSP + "msr apsr_nzcvqg, r2;" /* bonkers syntax */ +#else + "msr apsr_nzcvq, r2;" /* not even source-compatible! */ +#endif + "pop {r0-r12, lr};" + "pop {pc};" + + "3:" /* Label for restore address */ + _R7_CLOBBER_OPT("pop {r7};")::"r"(r4), + "r"(r5) + : "r6", "r8", "r9", "r10", +#ifndef CONFIG_ARM_GCC_FP_WORKAROUND + "r7", +#endif + "r11"); +} + +#ifdef CONFIG_USE_SWITCH +static ALWAYS_INLINE void arch_switch(void *switch_to, void **switched_from) +{ + arm_m_switch(switch_to, switched_from); +} +#endif + +#endif /* _ZEPHYR_ARCH_ARM_M_SWITCH_H */ diff --git a/include/zephyr/arch/arm/cortex_m/exception.h b/include/zephyr/arch/arm/cortex_m/exception.h index d675efa1549d0..2b90e797b2799 100644 --- a/include/zephyr/arch/arm/cortex_m/exception.h +++ b/include/zephyr/arch/arm/cortex_m/exception.h @@ -15,6 +15,9 @@ #include #include +#ifndef _ASMLANGUAGE +#include +#endif /* for assembler, only works with constants */ #define Z_EXC_PRIO(pri) (((pri) << (8 - NUM_IRQ_PRIO_BITS)) & 0xff) @@ -125,7 +128,14 @@ struct arch_esf { extern uint32_t z_arm_coredump_fault_sp; +#ifdef CONFIG_USE_SWITCH +static inline void z_arm_exc_exit(void) +{ + arm_m_exc_tail(); +} +#else extern void z_arm_exc_exit(void); +#endif #ifdef __cplusplus } diff --git a/include/zephyr/arch/arm/irq.h b/include/zephyr/arch/arm/irq.h index 40d467aa3d318..a63098d4a0138 100644 --- a/include/zephyr/arch/arm/irq.h +++ b/include/zephyr/arch/arm/irq.h @@ -18,6 +18,9 @@ #include #include +#if !defined(_ASMLANGUAGE) && defined(CONFIG_CPU_CORTEX_M) +#include +#endif #ifdef __cplusplus extern "C" { @@ -74,7 +77,14 @@ void z_soc_irq_eoi(unsigned int irq); #endif +#if defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) +static inline void z_arm_int_exit(void) +{ + arm_m_exc_tail(); +} +#else extern void z_arm_int_exit(void); +#endif extern void z_arm_interrupt_init(void); @@ -144,9 +154,6 @@ extern void _arch_isr_direct_pm(void); #define ARCH_ISR_DIRECT_HEADER() arch_isr_direct_header() #define ARCH_ISR_DIRECT_FOOTER(swap) arch_isr_direct_footer(swap) -/* arch/arm/core/exc_exit.S */ -extern void z_arm_int_exit(void); - #ifdef CONFIG_TRACING_ISR extern void sys_trace_isr_enter(void); extern void sys_trace_isr_exit(void); diff --git a/include/zephyr/arch/arm/thread.h b/include/zephyr/arch/arm/thread.h index 139f606809f9d..075a00f623e06 100644 --- a/include/zephyr/arch/arm/thread.h +++ b/include/zephyr/arch/arm/thread.h @@ -22,7 +22,15 @@ #ifndef _ASMLANGUAGE #include +/* Cortex M's USE_SWITCH implementation is somewhat unique and doesn't + * use much of the thread struct + */ +#if defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) +#define _ARM_M_SWITCH +#endif + struct _callee_saved { +#ifndef _ARM_M_SWITCH uint32_t v1; /* r4 */ uint32_t v2; /* r5 */ uint32_t v3; /* r6 */ @@ -35,12 +43,14 @@ struct _callee_saved { #ifdef CONFIG_USE_SWITCH uint32_t lr; /* lr */ #endif +#endif /* !_ARM_M_SWITCH */ }; typedef struct _callee_saved _callee_saved_t; #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) struct _preempt_float { +#ifndef _ARM_M_SWITCH float s16; float s17; float s18; @@ -57,16 +67,24 @@ struct _preempt_float { float s29; float s30; float s31; +#endif /* !_ARM_M_SWITCH */ }; #endif struct _thread_arch { +#ifndef _ARM_M_SWITCH /* interrupt locking key */ uint32_t basepri; /* r0 in stack frame cannot be 
written to reliably */ uint32_t swap_return_value; +#endif + +#ifdef _ARM_M_SWITCH + uint32_t iciit_pc; + uint32_t iciit_apsr; +#endif #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) /* diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 075e091cbb62d..d02cafcf698d3 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -61,6 +61,25 @@ list(APPEND kernel_files version.c ) +# Toolchain workaround: on ARM Thumb, gcc will normally enable +# -fomit-frame-pointer at all optimization levels except -O0. When +# frame pointers ARE enabled, gcc will also infuriatingly disallow a +# clobber of the r7 frame pointer register in inline assembly +# (vs. just spilling filling as needed). Our optimized arch_switch +# requires the ability to clobber this (callee-saved) register. Turn +# the flag back on for the two files that need it. (The error +# behavior only gets modified at the level of a whole file, I tried +# #pragmas and attributes to change the code generation for the +# specific functions without success). +if(CONFIG_CPU_CORTEX_M) + if(CONFIG_NO_OPTIMIZATIONS) + if(${COMPILER} STREQUAL "gcc") + set_source_files_properties(init.c PROPERTIES COMPILE_OPTIONS -fomit-frame-pointer) + set_source_files_properties(sched.c PROPERTIES COMPILE_OPTIONS -fomit-frame-pointer) + endif() + endif() +endif() + if(CONFIG_SCHED_CPU_MASK) list(APPEND kernel_files cpu_mask.c diff --git a/kernel/include/ksched.h b/kernel/include/ksched.h index 171e25cc37beb..fa11f9a1e4768 100644 --- a/kernel/include/ksched.h +++ b/kernel/include/ksched.h @@ -63,6 +63,26 @@ int z_unpend_all(_wait_q_t *wait_q); bool z_thread_prio_set(struct k_thread *thread, int prio); void *z_get_next_switch_handle(void *interrupted); +/* Wrapper around z_get_next_switch_handle() for the benefit of + * non-SMP platforms that always pass a NULL interrupted handle. + * Exposes the (extremely) common early exit case in a context that + * can be (much) better optimized in surrounding C code. Takes a + * _current pointer that the outer scope surely already has to avoid + * having to refetch it across a lock. + * + * Mystic arts for cycle nerds, basically. Replace with + * z_get_next_switch_handle() if this becomes a maintenance hassle. 
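+ *
+ * Rough usage sketch (hypothetical, not literal kernel code, and it
+ * glosses over the bookkeeping the real do_swap() path performs): a
+ * uniprocessor cooperative-switch path would look something like
+ *
+ *     struct k_thread *old = _current;
+ *     void *next = z_sched_next_handle(old);
+ *
+ *     if (next != NULL) {
+ *             arch_switch(next, &old->switch_handle);
+ *     }
+ *
+ * with the NULL return being the "current thread is still the best
+ * choice, nothing to do" fast path.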
+ */ +static ALWAYS_INLINE void *z_sched_next_handle(struct k_thread *curr) +{ +#ifndef CONFIG_SMP + if (curr == _kernel.ready_q.cache) { + return NULL; + } +#endif + return z_get_next_switch_handle(NULL); +} + void z_time_slice(void); void z_reset_time_slice(struct k_thread *curr); void z_sched_ipi(void); diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h index cff3efab6e90e..64eca71baf1f6 100644 --- a/kernel/include/kswap.h +++ b/kernel/include/kswap.h @@ -114,11 +114,18 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key, if (is_spinlock && lock != NULL && lock != &_sched_spinlock) { k_spin_release(lock); } - if (!is_spinlock || lock != &_sched_spinlock) { - (void) k_spin_lock(&_sched_spinlock); + if (IS_ENABLED(CONFIG_SMP) || IS_ENABLED(CONFIG_SPIN_VALIDATE)) { + /* Taking a nested uniprocessor lock in void context is a noop */ + if (!is_spinlock || lock != &_sched_spinlock) { + (void)k_spin_lock(&_sched_spinlock); + } } +#ifdef CONFIG_SMP new_thread = z_swap_next_thread(); +#else + new_thread = _kernel.ready_q.cache; +#endif if (new_thread != old_thread) { z_sched_usage_switch(new_thread); diff --git a/kernel/sched.c b/kernel/sched.c index 7ed11942107c3..066d5f0e4503e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -492,6 +492,43 @@ static inline void z_vrfy_k_thread_suspend(k_tid_t thread) #include #endif /* CONFIG_USERSPACE */ +static inline bool resched(uint32_t key) +{ +#ifdef CONFIG_SMP + _current_cpu->swap_ok = 0; +#endif /* CONFIG_SMP */ + + return arch_irq_unlocked(key) && !arch_is_in_isr(); +} + +/* + * Check if the next ready thread is the same as the current thread + * and save the trip if true. + */ +static inline bool need_swap(void) +{ + /* the SMP case will be handled in C based z_swap() */ +#ifdef CONFIG_SMP + return true; +#else + struct k_thread *new_thread; + + /* Check if the next ready thread is the same as the current thread */ + new_thread = _kernel.ready_q.cache; + return new_thread != _current; +#endif /* CONFIG_SMP */ +} + +static void reschedule(struct k_spinlock *lock, k_spinlock_key_t key) +{ + if (resched(key.key) && need_swap()) { + z_swap(lock, key); + } else { + k_spin_unlock(lock, key); + signal_pending_ipi(); + } +} + void z_impl_k_thread_resume(k_tid_t thread) { SYS_PORT_TRACING_OBJ_FUNC_ENTER(k_thread, resume, thread); @@ -507,7 +544,7 @@ void z_impl_k_thread_resume(k_tid_t thread) z_mark_thread_as_not_suspended(thread); ready_thread(thread); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); SYS_PORT_TRACING_OBJ_FUNC_EXIT(k_thread, resume, thread); } @@ -719,41 +756,9 @@ bool z_thread_prio_set(struct k_thread *thread, int prio) return need_sched; } -static inline bool resched(uint32_t key) -{ -#ifdef CONFIG_SMP - _current_cpu->swap_ok = 0; -#endif /* CONFIG_SMP */ - - return arch_irq_unlocked(key) && !arch_is_in_isr(); -} - -/* - * Check if the next ready thread is the same as the current thread - * and save the trip if true. 
- */ -static inline bool need_swap(void) -{ - /* the SMP case will be handled in C based z_swap() */ -#ifdef CONFIG_SMP - return true; -#else - struct k_thread *new_thread; - - /* Check if the next ready thread is the same as the current thread */ - new_thread = _kernel.ready_q.cache; - return new_thread != _current; -#endif /* CONFIG_SMP */ -} - void z_reschedule(struct k_spinlock *lock, k_spinlock_key_t key) { - if (resched(key.key) && need_swap()) { - z_swap(lock, key); - } else { - k_spin_unlock(lock, key); - signal_pending_ipi(); - } + reschedule(lock, key); } void z_reschedule_irqlock(uint32_t key) @@ -782,25 +787,22 @@ void k_sched_lock(void) void k_sched_unlock(void) { - K_SPINLOCK(&_sched_spinlock) { - __ASSERT(_current->base.sched_locked != 0U, ""); - __ASSERT(!arch_is_in_isr(), ""); - - ++_current->base.sched_locked; - update_cache(0); - } - LOG_DBG("scheduler unlocked (%p:%d)", _current, _current->base.sched_locked); SYS_PORT_TRACING_FUNC(k_thread, sched_unlock); - z_reschedule_unlocked(); + k_spinlock_key_t key = k_spin_lock(&_sched_spinlock); + + __ASSERT(_current->base.sched_locked != 0U, ""); + __ASSERT(!arch_is_in_isr(), ""); + ++_current->base.sched_locked; + update_cache(0); + reschedule(&_sched_spinlock, key); } struct k_thread *z_swap_next_thread(void) { -#ifdef CONFIG_SMP struct k_thread *ret = next_up(); if (ret == _current) { @@ -811,9 +813,6 @@ struct k_thread *z_swap_next_thread(void) signal_pending_ipi(); } return ret; -#else - return _kernel.ready_q.cache; -#endif /* CONFIG_SMP */ } #ifdef CONFIG_USE_SWITCH @@ -1048,7 +1047,7 @@ void z_impl_k_reschedule(void) update_cache(0); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); } #ifdef CONFIG_USERSPACE @@ -1189,7 +1188,7 @@ void z_impl_k_wakeup(k_tid_t thread) z_abort_thread_timeout(thread); z_mark_thread_as_not_sleeping(thread); ready_thread(thread); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); } else { k_spin_unlock(&_sched_spinlock, key); } diff --git a/samples/sensor/sensor_shell/prj.conf b/samples/sensor/sensor_shell/prj.conf index d3189e39aeac7..113cf803f5c7f 100644 --- a/samples/sensor/sensor_shell/prj.conf +++ b/samples/sensor/sensor_shell/prj.conf @@ -6,3 +6,5 @@ CONFIG_SENSOR_INFO=y CONFIG_LOG=y CONFIG_RTIO_CONSUME_SEM=y + +CONFIG_MAIN_STACK_SIZE=2048 diff --git a/samples/subsys/ipc/openamp/remote/prj.conf b/samples/subsys/ipc/openamp/remote/prj.conf index ffe8ef3422975..85541ba949c82 100644 --- a/samples/subsys/ipc/openamp/remote/prj.conf +++ b/samples/subsys/ipc/openamp/remote/prj.conf @@ -6,3 +6,4 @@ CONFIG_HEAP_MEM_POOL_SIZE=4096 CONFIG_OPENAMP=y CONFIG_SEGGER_RTT_BUFFER_SIZE_UP=4096 CONFIG_OPENAMP_MASTER=n +CONFIG_MAIN_STACK_SIZE=3172 diff --git a/soc/nxp/imxrt/imxrt5xx/cm33/soc.c b/soc/nxp/imxrt/imxrt5xx/cm33/soc.c index b826080323f12..38fb023600a21 100644 --- a/soc/nxp/imxrt/imxrt5xx/cm33/soc.c +++ b/soc/nxp/imxrt/imxrt5xx/cm33/soc.c @@ -99,6 +99,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -118,7 +124,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. 
*/ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else diff --git a/soc/nxp/imxrt/imxrt6xx/cm33/soc.c b/soc/nxp/imxrt/imxrt6xx/cm33/soc.c index 68f5b9faf16db..4c65d79e44fcf 100644 --- a/soc/nxp/imxrt/imxrt6xx/cm33/soc.c +++ b/soc/nxp/imxrt/imxrt6xx/cm33/soc.c @@ -93,6 +93,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -112,7 +118,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. */ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else diff --git a/soc/nxp/rw/soc.c b/soc/nxp/rw/soc.c index 3e928123c496c..c23444f179d7d 100644 --- a/soc/nxp/rw/soc.c +++ b/soc/nxp/rw/soc.c @@ -39,6 +39,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -58,7 +64,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. */ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else diff --git a/subsys/debug/thread_info.c b/subsys/debug/thread_info.c index bcd58211e3bb8..ac5453ced8c40 100644 --- a/subsys/debug/thread_info.c +++ b/subsys/debug/thread_info.c @@ -57,6 +57,8 @@ const size_t _kernel_thread_info_offsets[] = { /* We are assuming that the SP of interest is SP_EL1 */ [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, callee_saved.sp_elx), +#elif defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) + [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, switch_handle), #elif defined(CONFIG_ARM) [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, callee_saved.psp), diff --git a/tests/arch/arm/arm_interrupt/src/arm_interrupt.c b/tests/arch/arm/arm_interrupt/src/arm_interrupt.c index 05535283be332..f47eac5646d93 100644 --- a/tests/arch/arm/arm_interrupt/src/arm_interrupt.c +++ b/tests/arch/arm/arm_interrupt/src/arm_interrupt.c @@ -371,8 +371,19 @@ void z_impl_test_arm_user_interrupt_syscall(void) #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) /* Confirm IRQs are not locked */ zassert_false(__get_PRIMASK(), "PRIMASK is set\n"); -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) && defined(CONFIG_USE_SWITCH) + /* Confirm IRQs are not locked */ + zassert_false(__get_BASEPRI(), "BASEPRI is set\n"); +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) && !defined(CONFIG_USE_SWITCH) + /* The original ARMv7M userspace implementation had the + * somewhat odd (and dangerous!) 
behavior that an interrupt + * mask done in a system call would persist (!!) on return + * into userspace, and be seen again once we were back in + * kernel mode, and this is a test for it. It's legacy + * though, not something we ever documented, and not the way + * userspace works (or even can work!) on other platforms, nor + * with the newer arch_switch() code. Consider removing. + */ static bool first_call = 1; if (first_call == 1) { diff --git a/tests/arch/arm/arm_interrupt/testcase.yaml b/tests/arch/arm/arm_interrupt/testcase.yaml index df47b02fccd93..09241a3664b0a 100644 --- a/tests/arch/arm/arm_interrupt/testcase.yaml +++ b/tests/arch/arm/arm_interrupt/testcase.yaml @@ -18,6 +18,6 @@ tests: - CONFIG_MAIN_STACK_SIZE=2048 - CONFIG_ZTEST_STACK_SIZE=1280 arch.arm.interrupt.extra_exception_info: - filter: not CONFIG_TRUSTED_EXECUTION_NONSECURE + filter: ARCH_HAS_EXTRA_EXCEPTION_INFO and not CONFIG_TRUSTED_EXECUTION_NONSECURE extra_configs: - CONFIG_EXTRA_EXCEPTION_INFO=y diff --git a/tests/arch/arm/arm_switch/CMakeLists.txt b/tests/arch/arm/arm_switch/CMakeLists.txt new file mode 100644 index 0000000000000..f78d1c206421f --- /dev/null +++ b/tests/arch/arm/arm_switch/CMakeLists.txt @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20.0) + +find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE}) +project(arm_switch) + +target_sources(app PRIVATE src/main.c) + +# Need to unit test arch headers +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/arch/${ARCH}/include) diff --git a/tests/arch/arm/arm_switch/prj.conf b/tests/arch/arm/arm_switch/prj.conf new file mode 100644 index 0000000000000..45473f9d6e344 --- /dev/null +++ b/tests/arch/arm/arm_switch/prj.conf @@ -0,0 +1,5 @@ +CONFIG_ZTEST=y +CONFIG_MULTITHREADING=n +CONFIG_FPU=y +CONFIG_MPU=n +CONFIG_ARM_MPU=n diff --git a/tests/arch/arm/arm_switch/src/main.c b/tests/arch/arm/arm_switch/src/main.c new file mode 100644 index 0000000000000..4e656e2781ea8 --- /dev/null +++ b/tests/arch/arm/arm_switch/src/main.c @@ -0,0 +1,205 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +#include +#include + +#ifdef CONFIG_MULTITHREADING +#error "Test does not work with CONFIG_MULTITHREADING=n" +#endif + +char stack[4096]; + +void *main_sh, *my_sh; + +/* Aforementioned dirty trick: this value becomes the "next switch + * handle" examined by the interrupt exit code in lieu of whatever the + * scheduler would return. 
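+ *
+ * So a test case could, for example, do
+ *
+ *     next_sh = my_sh;    /* make the exception exit switch to my_fn() */
+ *
+ * before triggering an interrupt; the smoke test below leaves it NULL
+ * for the plain SVC exercise, in which case the exception simply
+ * returns to the interrupted code.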
+ */ +void *next_sh; + +int sum; + +bool my_fn_ran; + +extern void *arm_m_last_switch_handle; + +void *z_get_next_switch_handle(void *interrupted) +{ + return next_sh; +} + +void my_fn(void *a, void *b, void *c, void *d) +{ + void *psplim; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + + printk("%s: PSPLIM = %p\n", __func__, psplim); + + zassert_equal((int)a, 0); + zassert_equal((int)b, 1); + zassert_equal((int)c, 2); + + register int A = 11; + register int B = 12; + register int C = 13; + register int D = 14; + register int E = 15; + +#ifdef CONFIG_CPU_HAS_FPU + register float FA = 11.0f; + register float FB = 12.0f; + register float FC = 13.0f; +#endif + + for (int n = 0; /**/; n++) { + printk("%s:%d iter %d\n", __func__, __LINE__, n); + + if (arm_m_last_switch_handle) { + printk("Using exception handle @ %p\n", arm_m_last_switch_handle); + main_sh = arm_m_last_switch_handle; + arm_m_last_switch_handle = NULL; + } + + my_fn_ran = true; + + arm_m_switch(main_sh, &my_sh); + + zassert_equal(A, 11); + zassert_equal(B, 12); + zassert_equal(C, 13); + zassert_equal(D, 14); + zassert_equal(E, 15); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 11.0f); + zassert_equal(FB, 12.0f); + zassert_equal(FC, 13.0f); +#endif + } +} + +/* Nothing in particular to do here except exercise the interrupt exit hook */ +void my_svc(void) +{ + printk("%s:%d\n", __func__, __LINE__); + arm_m_exc_tail(); + + /* Validate that the tail hook doesn't need to be last */ + printk(" arm_m_exc_tail() has been called\n"); +} + +ZTEST(arm_m_switch, test_smoke) +{ + void *psplim; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + printk("In main, PSPLIM = %p\n", psplim); + + /* "register" variables don't strictly force the compiler not + * to spill them, but inspecting the generated code shows it's + * leaving them in place across the switch. 
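+	 *
+	 * (If in doubt, this is easy to confirm offline by disassembling
+	 * the image, e.g.
+	 *
+	 *     arm-none-eabi-objdump -d build/zephyr/zephyr.elf
+	 *
+	 * and checking that A..E stay in callee-saved registers across
+	 * the arm_m_switch() call sites instead of being reloaded from
+	 * the stack.)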
+ */ + register int A = 1; + register int B = 2; + register int C = 3; + register int D = 4; + register int E = 5; + +#ifdef CONFIG_CPU_HAS_FPU + /* Prime all the FPU registers with something I can recognize in a debugger */ + uint32_t sregs[32]; + + for (int i = 0; i < 32; i++) { + sregs[i] = 0x3f800000 + i; + } + __asm__ volatile("vldm %0, {s0-s31}" ::"r"(sregs)); + + register float FA = 1.0f; + register float FB = 2.0f; + register float FC = 3.0f; +#endif + + sum += A + B + C + D + E; + + /* Hit an interrupt and make sure CPU state doesn't get messed up */ + printk("Invoking SVC\n"); + __asm__ volatile("svc 0"); + printk("...back\n"); + + zassert_equal(A, 1); + zassert_equal(B, 2); + zassert_equal(C, 3); + zassert_equal(D, 4); + zassert_equal(E, 5); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 1.0f); + zassert_equal(FB, 2.0f); + zassert_equal(FC, 3.0f); +#endif + + /* Now likewise switch to and from a foreign stack and check */ + my_sh = arm_m_new_stack(stack, sizeof(stack), my_fn, (void *)0, (void *)1, (void *)2, NULL); + + int cycles = 16; + + for (int n = 0; n < cycles; n++) { + printk("main() switching to my_fn() (iter %d)...\n", n); + my_fn_ran = false; + arm_m_switch(my_sh, &main_sh); + printk("...and back\n"); + + zassert_true(my_fn_ran); + zassert_equal(A, 1); + zassert_equal(B, 2); + zassert_equal(C, 3); + zassert_equal(D, 4); + zassert_equal(E, 5); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 1.0f); + zassert_equal(FB, 2.0f); + zassert_equal(FC, 3.0f); +#endif + } +} + +/* Makes a copy of the vector table in writable RAM (it's generally in + * a ROM section), redirects it, and hooks the SVC interrupt with our + * own code above so we can catch direct interrupts. + */ +void *vector_hijack(void) +{ + static uint32_t __aligned(1024) vectors[256]; + uint32_t *vtor_p = (void *)0xe000ed08; + uint32_t *vtor = (void *)*vtor_p; + + printk("VTOR @%p\n", vtor); + if (vtor == NULL) { + /* mps2/an385 doesn't set this up, don't know why */ + printk("VTOR not set up by SOC, skipping case\n"); + ztest_test_skip(); + return NULL; + } + + /* Vector count: _vector_start/end set by the linker. 
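+	 *
+	 * (Related reminder about the table itself: VTOR ignores its low
+	 * address bits, so the replacement table must be aligned to a
+	 * power of two at least as large as its size in bytes, 128 bytes
+	 * minimum; the 256-word table above is 1024 bytes, hence the
+	 * __aligned(1024).)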
*/ + int nv = (&_vector_end[0] - &_vector_start[0]) / sizeof(uint32_t); + + for (int i = 0; i < nv; i++) { + vectors[i] = vtor[i]; + } + *vtor_p = (uint32_t)&vectors[0]; + vtor = (void *)*vtor_p; + printk("VTOR now @%p\n", vtor); + + /* And hook the SVC call with our own function above, allowing + * us direct access to interrupt entry + */ + vtor[11] = (int)my_svc; + printk("vtor[11] == %p (my_svc == %p)\n", (void *)vtor[11], my_svc); + + return NULL; +} + +ZTEST_SUITE(arm_m_switch, NULL, vector_hijack, NULL, NULL, NULL); diff --git a/tests/arch/arm/arm_switch/testcase.yaml b/tests/arch/arm/arm_switch/testcase.yaml new file mode 100644 index 0000000000000..12d2439155657 --- /dev/null +++ b/tests/arch/arm/arm_switch/testcase.yaml @@ -0,0 +1,6 @@ +tests: + arch.arm.switch: + arch_allow: arm + filter: CONFIG_CPU_CORTEX_M and CONFIG_USE_SWITCH + tags: + - arm diff --git a/tests/arch/arm/arm_thread_swap/testcase.yaml b/tests/arch/arm/arm_thread_swap/testcase.yaml index e26d88429050f..bf04663e7a03a 100644 --- a/tests/arch/arm/arm_thread_swap/testcase.yaml +++ b/tests/arch/arm/arm_thread_swap/testcase.yaml @@ -1,5 +1,6 @@ common: filter: CONFIG_ARMV6_M_ARMV8_M_BASELINE or CONFIG_ARMV7_M_ARMV8_M_MAINLINE + and not CONFIG_USE_SWITCH tags: - arm - fpu diff --git a/tests/benchmarks/sched_queues/testcase.yaml b/tests/benchmarks/sched_queues/testcase.yaml index e60bcb1fef2d2..3813223fd77df 100644 --- a/tests/benchmarks/sched_queues/testcase.yaml +++ b/tests/benchmarks/sched_queues/testcase.yaml @@ -28,6 +28,7 @@ tests: benchmark.sched_queues.scalable: extra_configs: - CONFIG_SCHED_SCALABLE=y + - CONFIG_ARM_GCC_FP_WORKAROUND=y benchmark.sched_queues.multiq: extra_configs: diff --git a/tests/benchmarks/swap/CMakeLists.txt b/tests/benchmarks/swap/CMakeLists.txt new file mode 100644 index 0000000000000..271fa88a90fd7 --- /dev/null +++ b/tests/benchmarks/swap/CMakeLists.txt @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20.0) + +find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE}) +project(hello_world) + +target_sources(app PRIVATE src/main.c) + +# Needed to call z_swap() +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/kernel/include) +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/arch/${ARCH}/include) diff --git a/tests/benchmarks/swap/prj.conf b/tests/benchmarks/swap/prj.conf new file mode 100644 index 0000000000000..a4c53f742e6cc --- /dev/null +++ b/tests/benchmarks/swap/prj.conf @@ -0,0 +1,9 @@ +CONFIG_SPIN_VALIDATE=n +CONFIG_ASSERT=n +CONFIG_MPU=n +CONFIG_ARM_MPU=n +CONFIG_FPU=n +CONFIG_TIMESLICING=n +CONFIG_HW_STACK_PROTECTION=n +CONFIG_THREAD_LOCAL_STORAGE=n +CONFIG_MINIMAL_LIBC=y diff --git a/tests/benchmarks/swap/src/main.c b/tests/benchmarks/swap/src/main.c new file mode 100644 index 0000000000000..7f9d21cb4e774 --- /dev/null +++ b/tests/benchmarks/swap/src/main.c @@ -0,0 +1,279 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* This app implements a simple microbenchmark of z_swap(), carefully + * tuned to measure only cooperative swapping performance and nothing + * else. Build it and run. No special usage needed. + * + * Subtle sequencing, see comments below. This runs without a timer + * driver (and in fact disables ARM SysTick so it can use it + * directly), controlling execution order via scheduler priority only. + */ + +#include +#include +#include + +/* Use a microtuned platform timing hook if available, otherwise k_cycle_get_32(). 
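+ *
+ * Whichever header gets included below is expected to provide the
+ * same three inline helpers:
+ *
+ *     void     time_setup(void);                      - one-time init
+ *     uint32_t time(void);                            - raw timestamp
+ *     uint32_t time_delta(uint32_t t0, uint32_t t1);  - elapsed cycles
+ *
+ * (see time_arm_m.h and time_generic.h later in this patch).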
*/ +#ifdef CONFIG_CPU_CORTEX_M +#include "time_arm_m.h" +#else +#include "time_generic.h" +#endif + +/* Check config for obvious mistakes. */ +#ifdef CONFIG_ASSERT +#warning "This is a performance benchmark, debug features should not normally be enabled" +#define ORDER_CHECK() printk("%s:%d\n", __func__, __LINE__) +#else +#define ORDER_CHECK() +#endif + +/* This ends up measuring a bunch of stuff not involved in arch code */ +#if defined(CONFIG_TIMESLICING) +#warning "Timeslicing can pollute the microbenchmark" +#endif + +/* And these absolutely explode the numbers, making the arch layer just noise */ +#if defined(CONFIG_MPU) || defined(CONFIG_MMU) +#error "Don't enable memory management hardware in a microbenchmark!" +#endif + +#if defined(CONFIG_FPU) +#error "Don't enable FPU/DSP in a microbenchmark!" +#endif + +#if defined(CONFIG_HW_STACK_PROTECTION) +#error "Don't enable hardware stack protection in a microbenchmark!" +#endif + +#define HI_PRIO 0 +#define LO_PRIO 1 +#define MAIN_PRIO 2 +#define DONE_PRIO 3 + +void thread_fn(void *t0_arg, void *t1_arg, void *c); + +int swap_count; + +/* swap enter/exit times for each thread */ +int t0_0, t0_1, t1_0, t1_1; + +uint32_t samples[8 * 1024]; + +struct k_spinlock lock; + +K_SEM_DEFINE(done_sem, 0, 999); +K_THREAD_DEFINE(thread0, 1024, thread_fn, &t0_0, &t0_1, NULL, HI_PRIO, 0, 0); +K_THREAD_DEFINE(thread1, 1024, thread_fn, &t1_0, &t1_1, NULL, HI_PRIO, 0, 0); + +void report(const char *name, size_t n) +{ + uint64_t total = 0; + + for (int i = 0; i < n; i++) { + total += samples[i]; + } + + float var = 0, avg = total / (float)n; + + for (int i = 0; i < n; i++) { + int d = samples[i] - avg; + + var += d * d; + } + + float stdev = sqrt(var / n); + int iavg = (int)(avg + 0.5f); + int stdev_i = (int)stdev; + int stdev_f = (int)(10 * (stdev - stdev_i)); + + printk("%s samples=%d average %d stdev %d.%d\n", name, n, iavg, stdev_i, stdev_f); +} + +void thread_fn(void *t0_arg, void *t1_arg, void *c) +{ + uint32_t t0, t1, *t0_out = t0_arg, *t1_out = t1_arg; + + while (true) { + k_spinlock_key_t k = k_spin_lock(&lock); + + k_thread_priority_set(k_current_get(), DONE_PRIO); + + ORDER_CHECK(); + t0 = time(); + z_swap(&lock, k); + t1 = time(); + + *t0_out = t0; + *t1_out = t1; + } +} + +void swap_bench(void) +{ + size_t n = ARRAY_SIZE(samples); + +#ifdef CONFIG_SMP + __ASSERT(arch_num_cpus() == 1, "Test requires only one CPU be active"); +#endif + + time_setup(); + k_thread_priority_set(k_current_get(), MAIN_PRIO); + + /* The threads are launched by the kernel at HI priority, so + * they've already run and are suspended in swap for us. + */ + for (int i = 0; i < n; i++) { + k_sched_lock(); + + ORDER_CHECK(); + k_thread_priority_set(thread0, HI_PRIO); + ORDER_CHECK(); + k_thread_priority_set(thread1, LO_PRIO); + ORDER_CHECK(); + + /* Now unlock: thread0 will run first, looping around + * and calling z_swap, with an entry timestamp of + * t0_0. That will swap to thread1 (which timestamps + * its exit as t1_1). Then we end up back here. + */ + ORDER_CHECK(); + k_sched_unlock(); + + /* And some complexity: thread1 "woke up" on our cycle + * and stored its exit time in t1_1. But thread0's + * entry time is still a local variable suspended on + * its stack. So we pump it once to get it to store + * its output. 
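+		 *
+		 * Net effect per iteration (what the sample measures):
+		 *
+		 *     thread0:  t0_0 = time(); z_swap(...)  -> switches to thread1
+		 *     thread1:  z_swap(...) returns; t1_1 = time();
+		 *
+		 *     samples[i] = time_delta(t0_0, t1_1);
+		 *
+		 * i.e. each sample covers exactly one cooperative context
+		 * switch, with nothing else scheduled in between.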
+ */ + k_thread_priority_set(thread0, HI_PRIO); + + uint32_t dt = time_delta(t0_0, t1_1); + + samples[i] = dt; + } + + k_thread_abort(thread0); + k_thread_abort(thread1); + + report("SWAP", n); +} + +struct k_timer tm; + +K_SEM_DEFINE(hi_sem, 0, 9999); +uint32_t t_preempt; + +void hi_thread_fn(void *a, void *b, void *c) +{ + while (true) { + k_sem_take(&hi_sem, K_FOREVER); + t_preempt = k_cycle_get_32(); + } +} + +K_THREAD_DEFINE(hi_thread, 1024, hi_thread_fn, NULL, NULL, NULL, -1, 0, 0); + +void timer0_fn(struct k_timer *t) +{ + /* empty */ +} + +/* The hardware devices I have see excursions in interrupt latency I + * don't understand. It looks like more than one is queued up? Check + * that nothing looks weird and retry if it does. Doing this gives + * MUCH lower variance (like 2-3 cycle stdev and not 100+) + */ +bool retry(int i) +{ + if (i == 0) { + return false; + } + + uint32_t dt = samples[i], dt0 = samples[i - 1]; + + /* Check for >25% delta */ + if (dt > dt0 && (dt - dt0) > (dt0 / 4)) { + return true; + } + return false; +} + +void irq_bench(void) +{ + size_t n = MIN(ARRAY_SIZE(samples), 4 * CONFIG_SYS_CLOCK_TICKS_PER_SEC); + uint32_t t0, t1; + unsigned int key; + + k_timer_init(&tm, timer0_fn, NULL); + + for (int i = 0; i < n; i++) { + /* Lock interrupts before starting the timer, then + * wait for two (!) ticks to roll over to be sure the + * interrupt is queued. + */ + key = arch_irq_lock(); + k_timer_start(&tm, K_TICKS(0), K_FOREVER); + + k_busy_wait(k_ticks_to_us_ceil32(3)); + + /* Releasing the lock will fire the interrupt synchronously. */ + t0 = k_cycle_get_32(); + arch_irq_unlock(key); + t1 = k_cycle_get_32(); + + k_timer_stop(&tm); + + samples[i] = t1 - t0; + + if (retry(i)) { + i--; + } + } + report("IRQ", n); +} + +/* Very similar test, but switches to hi_thread and checks time on + * interrupt exit there, to measure preemption overhead + */ +void irq_p_bench(void) +{ + size_t n = MIN(ARRAY_SIZE(samples), 4 * CONFIG_SYS_CLOCK_TICKS_PER_SEC); + uint32_t t0; + unsigned int key; + + k_timer_init(&tm, timer0_fn, NULL); + + for (int i = 0; i < n; i++) { + key = arch_irq_lock(); + k_timer_start(&tm, K_TICKS(0), K_FOREVER); + + k_busy_wait(k_ticks_to_us_ceil32(3)); + + k_sem_give(&hi_sem); + t0 = k_cycle_get_32(); + arch_irq_unlock(key); + + k_timer_stop(&tm); + + samples[i] = t_preempt - t0; + + if (retry(i)) { + i--; + } + } + report("IRQ_P", n); +} + +int main(void) +{ + irq_bench(); + irq_p_bench(); + + /* This disables the SysTick interrupt and must be last! */ + swap_bench(); + + return 0; +} diff --git a/tests/benchmarks/swap/src/time_arm_m.h b/tests/benchmarks/swap/src/time_arm_m.h new file mode 100644 index 0000000000000..8f246bc7f6201 --- /dev/null +++ b/tests/benchmarks/swap/src/time_arm_m.h @@ -0,0 +1,24 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +static ALWAYS_INLINE void time_setup(void) +{ + /* Disable SysTick interrupts so the timer driver doesn't + * interfere, we want the full 24 bit space. + */ + SysTick->CTRL &= ~SysTick_CTRL_TICKINT_Msk; + SysTick->LOAD = 0xffffff; +} + +static ALWAYS_INLINE uint32_t time(void) +{ + return SysTick->VAL; +} + +static ALWAYS_INLINE uint32_t time_delta(uint32_t t0, uint32_t t1) +{ + /* SysTick counts down, not up! 
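+	 * Worked example: t0 = 1000 and t1 = 960 gives
+	 * (1000 - 960) & 0xffffff = 40 elapsed cycles; if the counter
+	 * reloaded in between (so t1 > t0), the 24-bit mask still
+	 * produces the right answer for any interval shorter than one
+	 * full wrap period.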
*/ + return (t0 - t1) & 0xffffff; +} diff --git a/tests/benchmarks/swap/src/time_generic.h b/tests/benchmarks/swap/src/time_generic.h new file mode 100644 index 0000000000000..331797d731b88 --- /dev/null +++ b/tests/benchmarks/swap/src/time_generic.h @@ -0,0 +1,16 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +static ALWAYS_INLINE void time_setup(void) +{ +} + +static ALWAYS_INLINE uint32_t time(void) +{ + return k_cycle_get_32(); +} + +static ALWAYS_INLINE uint32_t time_delta(uint32_t t0, uint32_t t1) +{ + return t1 - t0; +} diff --git a/tests/benchmarks/thread_metric/Kconfig b/tests/benchmarks/thread_metric/Kconfig index 72ebc3042dfbf..9ea39489461d2 100644 --- a/tests/benchmarks/thread_metric/Kconfig +++ b/tests/benchmarks/thread_metric/Kconfig @@ -30,7 +30,6 @@ config TM_INTERRUPT bool "Interrupt processing" select TEST select IRQ_OFFLOAD - select IRQ_OFFLOAD_NESTED help The interrupt processing benchmark has a single thread that causes an interrupt which results in its ISR incrementing a counter and then @@ -42,7 +41,6 @@ config TM_INTERRUPT_PREEMPTION bool "Interrupt processing preemption" select TEST select IRQ_OFFLOAD - select IRQ_OFFLOAD_NESTED help The interrupt preemption benchmark counts the number of times that an ISR from a software generated interrupt results in the preemption diff --git a/tests/benchmarks/thread_metric/prj.conf b/tests/benchmarks/thread_metric/prj.conf index df5648feb8704..bca6a64c16727 100644 --- a/tests/benchmarks/thread_metric/prj.conf +++ b/tests/benchmarks/thread_metric/prj.conf @@ -26,3 +26,9 @@ CONFIG_THREAD_LOCAL_STORAGE=n # Disable memory slab pointer validation CONFIG_MEM_SLAB_POINTER_VALIDATE=n + +# Turn off MPU explicitly, as some platforms enable it even when +# unused, and it impacts performance paths needlessly (on arm, ARM_MPU +# appears to select MPU, so you need to hit both) +CONFIG_MPU=n +CONFIG_ARM_MPU=n diff --git a/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c b/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c index 1a5c3211d452f..f3153aa0804b9 100644 --- a/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c +++ b/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c @@ -409,12 +409,8 @@ ZTEST_USER(mutex_api_1cpu, test_mutex_priority_inheritance) static void tThread_mutex_lock_should_fail(void *p1, void *p2, void *p3) { - k_timeout_t timeout; struct k_mutex *mutex = (struct k_mutex *)p1; - - timeout.ticks = 0; - timeout.ticks |= (uint64_t)(uintptr_t)p2 << 32; - timeout.ticks |= (uint64_t)(uintptr_t)p3 << 0; + k_timeout_t timeout = *(k_timeout_t *)p2; zassert_equal(-EAGAIN, k_mutex_lock(mutex, timeout), NULL); } @@ -440,30 +436,32 @@ static void tThread_mutex_lock_should_fail(void *p1, void *p2, void *p3) */ ZTEST(mutex_api_1cpu, test_mutex_timeout_race_during_priority_inversion) { - k_timeout_t timeout; - uintptr_t timeout_upper; - uintptr_t timeout_lower; - int helper_prio = k_thread_priority_get(k_current_get()) + 1; + const int low_prio = 2, helper_prio = 1, high_prio = 0; + + static ZTEST_DMEM k_timeout_t timeout; k_mutex_init(&tmutex); - /* align to tick boundary */ - k_sleep(K_TICKS(1)); + /* Set our priority to the "low" value so the thread can preempt us */ + k_thread_priority_set(k_current_get(), low_prio); - /* allow non-kobject data to be shared (via registers) */ timeout = K_TIMEOUT_ABS_TICKS(k_uptime_ticks() + CONFIG_TEST_MUTEX_API_THREAD_CREATE_TICKS); - timeout_upper = timeout.ticks >> 32; - timeout_lower = timeout.ticks & BIT64_MASK(32); k_mutex_lock(&tmutex, K_FOREVER); + + 
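+	/* (Side note: the timeout lives in ZTEST_DMEM above so that the
+	 * K_USER helper thread is allowed to dereference the pointer we
+	 * pass it as p2, at least when CONFIG_USERSPACE is enabled; a
+	 * plain kernel static would not be user-accessible.)
+	 */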
/* This thread will run immediately, preempt us, and boost our priority */ k_thread_create(&tdata, tstack, K_THREAD_STACK_SIZEOF(tstack), - tThread_mutex_lock_should_fail, &tmutex, (void *)timeout_upper, - (void *)timeout_lower, helper_prio, + tThread_mutex_lock_should_fail, &tmutex, &timeout, NULL, helper_prio, K_USER | K_INHERIT_PERMS, K_NO_WAIT); - k_thread_priority_set(k_current_get(), K_HIGHEST_THREAD_PRIO); - + /* Now we wait for the same absolute timeout the thread is + * blocked on, so we wake up simultaneously. But further + * boost our own priority so that we wake up first, creating + * the situation where the target thread sees both a timeout + * and an unlocked mutex simultaneously. + */ + k_thread_priority_set(k_current_get(), high_prio); k_sleep(timeout); k_mutex_unlock(&tmutex);