From 5635e5dc993421bab366f0d25d91f03fe7f47568 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 22 Jan 2025 08:38:12 -0800 Subject: [PATCH 01/27] arch: Remove FPU_SHARING dependence on MULTITHREADING This was just a pedantic setting. I mean, of course it makes no sense to have thread FPU management state features built when you aren't including the scheduler in the build. ...unless you want to unit-test the context switch code without tripping over itself on the way into the test code. In fact lots of unit testing of low level primitives can be done with MULTITHREADING=n. Remove the dependency. It isn't actually doing anything useful. Signed-off-by: Andy Ross --- arch/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index a179cc63b084c..5d4e7a90e7497 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1041,7 +1041,7 @@ config FPU config FPU_SHARING bool "FPU register sharing" - depends on FPU && MULTITHREADING + depends on FPU help This option enables preservation of the hardware floating point registers across context switches to allow multiple threads to perform concurrent From 3be8eb1ed07abfc27e3e469be9ebc5c2e074d65d Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sat, 1 Feb 2025 05:54:01 -0800 Subject: [PATCH 02/27] tests/kernel/mutex_api: Fix logic in PI race test This test seemed confused. Per the description, it's trying to create a priority boost situation where a low priority thread (the main ztest thread running test_mutex_timeout_race_during_priority_inversion()) holds a mutex and is boosted by a high priority thread (running tThread_mutex_lock_should_fail()) trying to lock the mutex. But that's not what the code was doing. It was creating the thread at a LOWER priority, so no priority inheritance was happening. The test would pass simply because the thread ordering via the natural scheduler caused the spawned thread to run first. Fix the priorities to be explicit. And add some comments explaining what's happening. Also clean up the timeout whiteboxing, which was apparently an attempt to evade userspace protections on the stack. Just use a shared pointer to ZTEST_DMEM. And remove a voodoo "align to tick boundary" sleep; that's needless here, as the timeout is explicitly constructed to be simultaneous; we don't care when we go to sleep, only that we wake up out of the same ISR.
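For reference, a condensed sketch of the sequence the fixed test performs (the tick offset and priority value here are illustrative; the real values live in the test source):

    /* Both sides block on the same absolute deadline, so both timeouts
     * expire out of the same tick interrupt.
     */
    k_timeout_t deadline = K_TIMEOUT_ABS_TICKS(k_uptime_ticks() + 10);

    k_mutex_lock(&tmutex, K_FOREVER);          /* low-prio owner holds the mutex */
    /* spawn the higher-priority helper, which does:
     *     k_mutex_lock(&tmutex, deadline);    -> expected to return -EAGAIN
     */
    k_thread_priority_set(k_current_get(), 0); /* boost above the helper */
    k_sleep(deadline);                         /* wake on the shared deadline... */
    k_mutex_unlock(&tmutex);                   /* ...and unlock before the helper runs */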
Signed-off-by: Andy Ross --- .../mutex/mutex_api/src/test_mutex_apis.c | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c b/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c index 1a5c3211d452f..f3153aa0804b9 100644 --- a/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c +++ b/tests/kernel/mutex/mutex_api/src/test_mutex_apis.c @@ -409,12 +409,8 @@ ZTEST_USER(mutex_api_1cpu, test_mutex_priority_inheritance) static void tThread_mutex_lock_should_fail(void *p1, void *p2, void *p3) { - k_timeout_t timeout; struct k_mutex *mutex = (struct k_mutex *)p1; - - timeout.ticks = 0; - timeout.ticks |= (uint64_t)(uintptr_t)p2 << 32; - timeout.ticks |= (uint64_t)(uintptr_t)p3 << 0; + k_timeout_t timeout = *(k_timeout_t *)p2; zassert_equal(-EAGAIN, k_mutex_lock(mutex, timeout), NULL); } @@ -440,30 +436,32 @@ static void tThread_mutex_lock_should_fail(void *p1, void *p2, void *p3) */ ZTEST(mutex_api_1cpu, test_mutex_timeout_race_during_priority_inversion) { - k_timeout_t timeout; - uintptr_t timeout_upper; - uintptr_t timeout_lower; - int helper_prio = k_thread_priority_get(k_current_get()) + 1; + const int low_prio = 2, helper_prio = 1, high_prio = 0; + + static ZTEST_DMEM k_timeout_t timeout; k_mutex_init(&tmutex); - /* align to tick boundary */ - k_sleep(K_TICKS(1)); + /* Set our priority to the "low" value so the thread can preempt us */ + k_thread_priority_set(k_current_get(), low_prio); - /* allow non-kobject data to be shared (via registers) */ timeout = K_TIMEOUT_ABS_TICKS(k_uptime_ticks() + CONFIG_TEST_MUTEX_API_THREAD_CREATE_TICKS); - timeout_upper = timeout.ticks >> 32; - timeout_lower = timeout.ticks & BIT64_MASK(32); k_mutex_lock(&tmutex, K_FOREVER); + + /* This thread will run immediately, preempt us, and boost our priority */ k_thread_create(&tdata, tstack, K_THREAD_STACK_SIZEOF(tstack), - tThread_mutex_lock_should_fail, &tmutex, (void *)timeout_upper, - (void *)timeout_lower, helper_prio, + tThread_mutex_lock_should_fail, &tmutex, &timeout, NULL, helper_prio, K_USER | K_INHERIT_PERMS, K_NO_WAIT); - k_thread_priority_set(k_current_get(), K_HIGHEST_THREAD_PRIO); - + /* Now we wait for the same absolute timeout the thread is + * blocked on, so we wake up simultaneously. But further + * boost our own priority so that we wake up first, creating + * the situation where the target thread sees both a timeout + * and an unlocked mutex simultaneously. + */ + k_thread_priority_set(k_current_get(), high_prio); k_sleep(timeout); k_mutex_unlock(&tmutex); From c5a435f639d077e5901c537a4c5a486453fe0f50 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sun, 2 Feb 2025 10:24:14 -0800 Subject: [PATCH 03/27] tests/arch/arm_thread_swap: Skip if USE_SWITCH=y This is a low-level/whitebox test of the older context layer and can't pass with the new code (which has its own unit test). 
Signed-off-by: Andy Ross --- tests/arch/arm/arm_thread_swap/testcase.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/arch/arm/arm_thread_swap/testcase.yaml b/tests/arch/arm/arm_thread_swap/testcase.yaml index e26d88429050f..bf04663e7a03a 100644 --- a/tests/arch/arm/arm_thread_swap/testcase.yaml +++ b/tests/arch/arm/arm_thread_swap/testcase.yaml @@ -1,5 +1,6 @@ common: filter: CONFIG_ARMV6_M_ARMV8_M_BASELINE or CONFIG_ARMV7_M_ARMV8_M_MAINLINE + and not CONFIG_USE_SWITCH tags: - arm - fpu From 3927254df7a1a44e667dfd6f3135aff5dd5bad7d Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 4 Feb 2025 09:55:32 -0800 Subject: [PATCH 04/27] arch/arm: New arch_switch() based context layer for Cortex M: 1. Mostly complete. Supports MPU, userspace, PSPLIM-based stack guards, and FPU/DSP features. ARMv8-M secure mode "should" work but I don't know how to test it. 2. Designed with an eye to uncompromising/best-in-industry cooperative context switch performance. No PendSV exception nor hardware stacking/unstacking, just a traditional "musical chairs" switch. Context gets saved on process stacks only instead of split between there and the thread struct. No branches in the core integer switch code (and just one in the FPU bits that can't be avoided). 3. Minimal assembly use; arch_switch() itself is ALWAYS_INLINE, there is an assembly stub for exception exit, and that's it beyond one/two instruction inlines elsewhere. 4. Selectable at build time, interoperable with existing code. Just use the pre-existing CONFIG_USE_SWITCH=y flag to enable it. Or turn it off to evade regressions as this stabilizes. 5. Exception/interrupt returns in the common case need only a single C function to be called at the tail, and then return naturally. Effectively "all interrupts are direct now". This isn't a benefit currently because the existing stubs haven't been removed (see #4), but in the long term we can look at exploiting this. The boilerplate previously required is now (mostly) empty. 6. No support for ARMv6 (Cortex M0 et. al.) thumb code. The expanded instruction encodings in ARMv7 are a big (big) win, so the older cores really need a separate port to avoid impacting newer hardware. Thankfully there isn't that much code to port (see #3), so this should be doable. 
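For orientation, the shape of the new low-level API, condensed from the MULTITHREADING=n unit test added later in this series (stack size and function names here are only illustrative):

    static char stack[1024];
    static void *main_sh, *other_sh;

    static void other_fn(void *a, void *b, void *c, void *d)
    {
        for (;;) {
            /* ...do some work, then save our context in other_sh and
             * resume whoever is parked in main_sh
             */
            arm_m_switch(main_sh, &other_sh);
        }
    }

    void demo(void)
    {
        /* Build an initial switch frame at the top of the new stack */
        other_sh = arm_m_new_stack(stack, sizeof(stack), other_fn,
                                   NULL, NULL, NULL, NULL);

        /* Runs other_fn() until it switches back; our own context is
         * saved through main_sh
         */
        arm_m_switch(other_sh, &main_sh);
    }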
Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/CMakeLists.txt | 2 + arch/arm/core/cortex_m/arm-m-switch.c | 459 +++++++++++++++++++++++++ include/zephyr/arch/arm/arm-m-switch.h | 226 ++++++++++++ 3 files changed, 687 insertions(+) create mode 100644 arch/arm/core/cortex_m/arm-m-switch.c create mode 100644 include/zephyr/arch/arm/arm-m-switch.h diff --git a/arch/arm/core/cortex_m/CMakeLists.txt b/arch/arm/core/cortex_m/CMakeLists.txt index 3de8e544bc121..bfa3ff8887381 100644 --- a/arch/arm/core/cortex_m/CMakeLists.txt +++ b/arch/arm/core/cortex_m/CMakeLists.txt @@ -35,6 +35,8 @@ zephyr_library_sources( cpu_idle.c ) +zephyr_library_sources_ifdef(CONFIG_USE_SWITCH arm-m-switch.c) + zephyr_library_sources_ifndef(CONFIG_ARM_CUSTOM_INTERRUPT_CONTROLLER irq_init.c) zephyr_library_sources_ifdef(CONFIG_GEN_SW_ISR_TABLE isr_wrapper.c) zephyr_library_sources_ifdef(CONFIG_DEBUG_COREDUMP coredump.c) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c new file mode 100644 index 0000000000000..088b8fb0a7c21 --- /dev/null +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -0,0 +1,459 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include + +/* The basic exception frame, popped by the hardware during return */ +struct hw_frame_base { + uint32_t r0, r1, r2, r3; + uint32_t r12; + uint32_t lr; + uint32_t pc; + uint32_t apsr; +}; + +/* The hardware frame pushed when entry is taken with FPU active */ +struct hw_frame_fpu { + struct hw_frame_base base; + uint32_t s_regs[16]; + uint32_t fpscr; + uint32_t reserved; +}; + +/* The hardware frame pushed when entry happens with a misaligned stack */ +struct hw_frame_align { + struct hw_frame_base base; + uint32_t align_pad; +}; + +/* Both of the above */ +struct hw_frame_align_fpu { + struct hw_frame_fpu base; + uint32_t align_pad; +}; + +/* Zephyr's synthesized frame used during context switch on interrupt + * exit. It's a minimal hardware frame plus storage for r4-11. + */ +struct synth_frame { + uint32_t r7, r8, r9, r10, r11; + uint32_t r4, r5, r6; /* these match switch format */ + struct hw_frame_base base; +}; + +/* Zephyr's custom frame used for suspended threads, not hw-compatible */ +struct switch_frame { +#ifdef CONFIG_BUILTIN_STACK_GUARD + uint32_t psplim; +#endif + uint32_t apsr; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12; + uint32_t lr; + uint32_t pc; +}; + +/* Union of synth and switch frame, used during context switch */ +union u_frame { + struct { + char pad[sizeof(struct switch_frame) - sizeof(struct synth_frame)]; + struct synth_frame hw; + }; + struct switch_frame sw; +}; + +/* u_frame with have_fpu flag prepended (zero value), no FPU state */ +struct z_frame { +#ifdef CONFIG_FPU + uint32_t have_fpu; +#endif + union u_frame u; +}; + +/* u_frame + FPU data, with have_fpu (non-zero) */ +struct z_frame_fpu { + uint32_t have_fpu; + uint32_t s_regs[32]; + uint32_t fpscr; + union u_frame u; +}; + +/* Union of all possible stack frame formats, aligned at the top (!). + * Note that FRAMESZ is constructed to be larger than any of them to + * avoid having a zero-length array. The code doesn't ever use the + * size of this struct, it just wants to be have compiler-visible + * offsets for in-place copies. 
+ */ +/* clang-format off */ +#define FRAMESZ (4 + MAX(sizeof(struct z_frame_fpu), sizeof(struct hw_frame_align_fpu))) +#define PAD(T) char pad_##T[FRAMESZ - sizeof(struct T)] +union frame { + struct { PAD(hw_frame_base); struct hw_frame_base hw; }; + struct { PAD(hw_frame_fpu); struct hw_frame_fpu hwfp; }; + struct { PAD(hw_frame_align); struct hw_frame_align hw_a; }; + struct { PAD(hw_frame_align_fpu); struct hw_frame_align_fpu hwfp_a; }; + struct { PAD(z_frame); struct z_frame z; }; + struct { PAD(z_frame_fpu); struct z_frame_fpu zfp; }; +}; +/* clang-format on */ + +#ifdef __GNUC__ +/* Validate the structs are correctly top-aligned */ +#define FRAME_FIELD_END(F) ((void *)&(&(((union frame *)0)->F))[1]) +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hwfp)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hw_a)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(hwfp_a)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(z)); +BUILD_ASSERT(FRAME_FIELD_END(hw) == FRAME_FIELD_END(zfp)); +#endif + +#ifdef CONFIG_FPU +uint32_t arm_m_switch_stack_buffer = sizeof(struct z_frame_fpu) - sizeof(struct hw_frame_base); +#else +uint32_t arm_m_switch_stack_buffer = sizeof(struct z_frame) - sizeof(struct hw_frame_base); +#endif + +struct arm_m_cs_ptrs arm_m_cs_ptrs; + +#ifdef CONFIG_LTO +/* Toolchain workaround: when building with LTO, gcc seems unable to + * notice the external references in the assembly for arm_m_exc_exit + * below, and drops the symbols before the final link. Use this + * global to store pointers in arm_m_new_stack(), wasting a few bytes + * of code & data. + */ +void *arm_m_lto_refs[2]; +#endif + +/* Unit test hook, unused in production */ +void *arm_m_last_switch_handle; + +/* Global holder for the location of the saved LR in the entry frame. */ +uint32_t *arm_m_exc_lr_ptr; + +/* Dummy used in arch_switch() when USERSPACE=y */ +uint32_t arm_m_switch_control; + +/* clang-format off */ +/* Emits an in-place copy from a hw_frame_base to a switch_frame */ +#define HW_TO_SWITCH(hw, sw) do { \ + uint32_t r0 = hw.r0, r1 = hw.r1, r2 = hw.r2, r3 = hw.r3; \ + uint32_t r12 = hw.r12, lr = hw.lr, pc = hw.pc, apsr = hw.apsr; \ + pc |= 1; /* thumb bit! */ \ + sw.r0 = r0; sw.r1 = r1; sw.r2 = r2; sw.r3 = r3; \ + sw.r12 = r12; sw.lr = lr; sw.pc = pc; sw.apsr = apsr; \ +} while (false) + +/* Emits an in-place copy from a switch_frame to a synth_frame */ +#define SWITCH_TO_SYNTH(sw, syn) do { \ + struct synth_frame syntmp = { \ + .r4 = sw.r4, .r5 = sw.r5, .r6 = sw.r6, .r7 = sw.r7, \ + .r8 = sw.r8, .r9 = sw.r9, .r10 = sw.r10, .r11 = sw.r11, \ + .base.r0 = sw.r0, .base.r1 = sw.r1, .base.r2 = sw.r2, \ + .base.r3 = sw.r3, .base.r12 = sw.r12, .base.lr = sw.lr, \ + .base.pc = sw.pc, .base.apsr = sw.apsr, \ + }; \ + syn = syntmp; \ +} while (false) +/* clang-format on */ + +/* Reports if the passed return address is a valid EXC_RETURN (high + * four bits set) that will restore to the PSP running in thread mode + * (low four bits == 0xd). That is an interrupted Zephyr thread + * context. For everything else, we just return directly via the + * hardware-pushed stack frame with no special handling. See ARMv7M + * manual B1.5.8. + */ +static bool is_thread_return(uint32_t lr) +{ + return (lr & 0xf000000f) == 0xf000000d; +} + +/* Returns true if the EXC_RETURN address indicates a FPU subframe was + * pushed to the stack. See ARMv7M manual B1.5.8. + */ +static bool fpu_state_pushed(uint32_t lr) +{ + return IS_ENABLED(CONFIG_CPU_HAS_FPU) ? 
!(lr & 0x10) : false; +} + +#ifdef CONFIG_BUILTIN_STACK_GUARD +#define PSPLIM(f) ((f)->z.u.sw.psplim) +#else +#define PSPLIM(f) 0 +#endif + +/* Converts, in place, a pickled "switch" frame from a suspended + * thread to a "synthesized" format that can be restored by the CPU + * hardware on exception exit. + */ +static void *arm_m_switch_to_cpu(void *sp) +{ + union frame *f; + uint32_t splim; + +#ifdef CONFIG_FPU + /* When FPU switching is enabled, the suspended handle always + * points to the have_fpu word, which will be followed by FPU + * state if non-zero. + */ + bool have_fpu = (*(uint32_t *)sp) != 0; + + if (have_fpu) { + f = CONTAINER_OF(sp, union frame, zfp.have_fpu); + splim = PSPLIM(f); + __asm__ volatile("vldm %0, {s0-s31}" ::"r"(&f->zfp.s_regs[0])); + SWITCH_TO_SYNTH(f->zfp.u.sw, f->zfp.u.hw); + } else { + f = CONTAINER_OF(sp, union frame, z.have_fpu); + splim = PSPLIM(f); + SWITCH_TO_SYNTH(f->z.u.sw, f->zfp.u.hw); + } +#else + f = CONTAINER_OF(sp, union frame, z.u.sw); + splim = PSPLIM(f); + SWITCH_TO_SYNTH(f->z.u.sw, f->z.u.hw); +#endif + +#ifdef CONFIG_BUILTIN_STACK_GUARD + __asm__ volatile("msr psplim, %0" ::"r"(splim)); +#endif + + /* Mark the callee-saved pointer for the fixup assembly. Note + * funny layout that puts r7 first! + */ + arm_m_cs_ptrs.in = &f->z.u.hw.r7; + + return &f->z.u.hw.base; +} + +static void fpu_cs_copy(struct hw_frame_fpu *src, struct z_frame_fpu *dst) +{ + for (int i = 0; IS_ENABLED(CONFIG_FPU) && i < 16; i++) { + dst->s_regs[i] = src->s_regs[i]; + } +} + +/* Converts, in-place, a CPU-spilled ("hardware") exception entry + * frame to our ("zephyr") switch handle format such that the thread + * can be suspended + */ +static void *arm_m_cpu_to_switch(struct k_thread *th, void *sp, bool fpu) +{ + union frame *f = NULL; + struct hw_frame_base *base = sp; + bool padded = (base->apsr & 0x200); + uint32_t fpscr; + + if (fpu && IS_ENABLED(CONFIG_FPU)) { + uint32_t dummy = 0; + + /* Lazy FPU stacking is enabled, so before we touch + * the stack frame we have to tickle the FPU to force + * it to spill the caller-save registers. Then clear + * CONTROL.FPCA which gets set again by that instruction. + */ + __asm__ volatile("vmov %0, s0;" + "mrs %0, control;" + "bic %0, %0, #4;" + "msr control, %0;" ::"r"(dummy)); + } + + if (IS_ENABLED(CONFIG_FPU) && fpu) { + fpscr = CONTAINER_OF(sp, struct hw_frame_fpu, base)->fpscr; + } + + /* There are four (!) different offsets from the interrupted + * stack at which the hardware frame might be found at + * runtime. These expansions let the compiler generate + * optimized in-place copies for each. In practice it does a + * pretty good job, much better than a double-copy via an + * intermediate buffer. Note that when FPU state is spilled + * we must copy the 16 spilled registers first, to make room + * for the copy. 
+ */ + if (!fpu && !padded) { + f = CONTAINER_OF(sp, union frame, hw.r0); + HW_TO_SWITCH(f->hw, f->z.u.sw); + } else if (!fpu && padded) { + f = CONTAINER_OF(sp, union frame, hw_a.base.r0); + HW_TO_SWITCH(f->hw_a.base, f->z.u.sw); + } else if (fpu && !padded) { + f = CONTAINER_OF(sp, union frame, hwfp.base.r0); + fpu_cs_copy(&f->hwfp, &f->zfp); + HW_TO_SWITCH(f->hwfp.base, f->z.u.sw); + } else if (fpu && padded) { + f = CONTAINER_OF(sp, union frame, hwfp_a.base.base.r0); + fpu_cs_copy(&f->hwfp_a.base, &f->zfp); + HW_TO_SWITCH(f->hwfp_a.base.base, f->z.u.sw); + } + +#ifdef CONFIG_BUILTIN_STACK_GUARD + __asm__ volatile("mrs %0, psplim" : "=r"(f->z.u.sw.psplim)); +#endif + + /* Mark the callee-saved pointer for the fixup assembly */ + arm_m_cs_ptrs.out = &f->z.u.sw.r4; + +#ifdef CONFIG_FPU + if (fpu) { + __asm__ volatile("vstm %0, {s16-s31}" ::"r"(&f->zfp.s_regs[16])); + f->zfp.fpscr = fpscr; + f->zfp.have_fpu = true; + return &f->zfp.have_fpu; + } + f->z.have_fpu = false; + return &f->z.have_fpu; +#else + return &f->z.u.sw; +#endif +} + +void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *arg1, void *arg2, + void *arg3) +{ + struct switch_frame *sw; + uint32_t baddr; + +#ifdef CONFIG_LTO + arm_m_lto_refs[0] = &arm_m_cs_ptrs; + arm_m_lto_refs[1] = arm_m_must_switch; +#endif + +#ifdef CONFIG_MULTITHREADING + /* Kludgey global initialization, stash computed pointers to + * the LR frame location and fixup address into these + * variables for use by arm_m_exc_tail(). Should move to arch + * init somewhere once arch_switch is better integrated + */ + char *stack = (char *)K_KERNEL_STACK_BUFFER(z_interrupt_stacks[0]); + uint32_t *s_top = (uint32_t *)(stack + K_KERNEL_STACK_SIZEOF(z_interrupt_stacks[0])); + + arm_m_exc_lr_ptr = &s_top[-1]; + arm_m_cs_ptrs.lr_fixup = (void *) (1 | (uint32_t)arm_m_exc_exit); /* thumb bit! */ +#endif + + baddr = ((uint32_t)base + 7) & ~7; + + sz = ((uint32_t)(base + sz) - baddr) & ~7; + + if (sz < sizeof(struct switch_frame)) { + return NULL; + } + + /* Note: a useful trick here would be to initialize LR to + * point to cleanup code, avoiding the need for the + * z_thread_entry wrapper, saving a few words of stack frame + * and a few cycles on thread entry. + */ + sw = (void *)(baddr + sz - sizeof(*sw)); + *sw = (struct switch_frame){ + IF_ENABLED(CONFIG_BUILTIN_STACK_GUARD, (.psplim = baddr,)) .r0 = (uint32_t)arg0, + .r1 = (uint32_t)arg1, .r2 = (uint32_t)arg2, .r3 = (uint32_t)arg3, + .pc = ((uint32_t)entry) | 1, /* set thumb bit! */ + .apsr = 0x1000000, /* thumb bit here too! 
*/ + }; + +#ifdef CONFIG_FPU + struct z_frame *zf = CONTAINER_OF(sw, struct z_frame, u.sw); + + zf->have_fpu = false; + return zf; +#else + return sw; +#endif +} + +bool arm_m_must_switch(uint32_t lr) +{ + struct k_thread *last_thread = NULL; + + if (IS_ENABLED(CONFIG_MULTITHREADING)) { + last_thread = _current; + } + + if (!is_thread_return(lr)) { + return false; + } + + /* This lock is held until the end of the context switch (see + * arch_switch and arm_m_exc_exit) + */ + unsigned int key = arch_irq_lock(); + + void *last, *next = z_get_next_switch_handle(NULL); + + if (next == NULL) { + arch_irq_unlock(key); + return false; + } + + bool fpu = fpu_state_pushed(lr); + + __asm__ volatile("mrs %0, psp" : "=r"(last)); + last = arm_m_cpu_to_switch(last_thread, last, fpu); + next = arm_m_switch_to_cpu(next); + __asm__ volatile("msr psp, %0" ::"r"(next)); + +#if !defined(CONFIG_MULTITHREADING) + arm_m_last_switch_handle = last; +#elif defined(CONFIG_USE_SWITCH) + last_thread->switch_handle = last; +#endif + +#ifdef CONFIG_USERSPACE + uint32_t control; + + __asm__ volatile("mrs %0, control" : "=r"(control)); + last_thread->arch.mode &= (~1) | (control & 1); + control = (control & ~1) | (_current->arch.mode & 1); + __asm__ volatile("msr control, %0" ::"r"(control)); +#endif + +#if defined(CONFIG_USERSPACE) || defined(CONFIG_MPU_STACK_GUARD) + z_arm_configure_dynamic_mpu_regions(_current); +#endif + +#ifdef CONFIG_THREAD_LOCAL_STORAGE + z_arm_tls_ptr = _current->tls; +#endif + return true; +} + +/* This is handled an inline now for C code, but there are a few spots + * that need to get to it from assembly (but which IMHO should really + * be ported to C) + */ +void arm_m_legacy_exit(void) +{ + arm_m_exc_tail(); +} + +/* We arrive here on return to thread code from exception handlers. + * We know that r4-r11 of the interrupted thread have been restored + * (other registers will be forgotten and can be clobbered). First + * call arm_m_must_switch() (which handles the other context switch + * duties), and spill/fill if necessary. If no context switch is + * needed, we just return via the original LR. If we are switching, + * we synthesize a integer-only EXC_RETURN as FPU state switching was + * handled in software already. + */ +#ifdef CONFIG_MULTITHREADING +__asm__(".globl arm_m_exc_exit;" + "arm_m_exc_exit:;" + " ldr r2, =arm_m_cs_ptrs;" + " ldr r0, [r2, #8];" /* lr_save as argument */ + " bl arm_m_must_switch;" + " ldr r2, =arm_m_cs_ptrs;" + " ldr lr, [r2, #8];" /* refetch lr_save as default lr */ + " cbz r0, 1f;" + " ldm r2, {r0, r1};" /* fields: out, in */ + " mov lr, #0xfffffffd;" /* integer-only LR */ + " stm r0, {r4-r11};" /* out is a switch_frame */ + " ldm r1!, {r7-r11};" /* in is a synth_frame */ + " ldm r1, {r4-r6};" + " mov r1, #0;" + " msr basepri, r1;" /* release lock taken in must_switch */ + "1:\n" + " bx lr;"); +#endif diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h new file mode 100644 index 0000000000000..2f6de26b63f68 --- /dev/null +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -0,0 +1,226 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _ZEPHYR_ARCH_ARM_M_SWITCH_H +#define _ZEPHYR_ARCH_ARM_M_SWITCH_H + +#include +#include +#include + +/* GCC/gas has a code generation bugglet on thumb. The R7 register is + * the ABI-defined frame pointer, though it's usually unused in zephyr + * due to -fomit-frame-pointer (and the fact the DWARF on ARM doesn't + * really need it). 
But when it IS enabled, which sometimes seems to + * happen due to toolchain internals, GCC is unable to allow its use + * in the clobber list of an asm() block (I guess it can't generate + * spill/fill code without using the frame?). + * + * When absolutely needed, this kconfig unmasks a workaround where we + * spill/fill R7 around the switch manually. + */ +#ifdef CONFIG_ARM_GCC_FP_WORKAROUND +#define _R7_CLOBBER_OPT(expr) expr +#else +#define _R7_CLOBBER_OPT(expr) /**/ +#endif + +/* Should probably be in kconfig, basically this is testing whether or + * not the toolchain will allow a "g" flag (DSP state) to an "msr + * adsp_" instruction. + */ +#if defined(CONFIG_CPU_CORTEX_M4) || defined(CONFIG_CPU_CORTEX_M7) || defined(CONFIG_ARMV8_M_DSP) +#define _ARM_M_SWITCH_HAVE_DSP +#endif + +void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *arg1, void *arg2, + void *arg3); + +bool arm_m_must_switch(void); + +void arm_m_exc_exit(void); + +extern uint32_t *arm_m_exc_lr_ptr; + +void z_arm_configure_dynamic_mpu_regions(struct k_thread *thread); + +extern uintptr_t z_arm_tls_ptr; + +extern uint32_t arm_m_switch_stack_buffer; + +/* Global pointers to the frame locations for the callee-saved + * registers. Set in arm_m_must_switch(), and used by the fixup + * assembly in arm_m_exc_exit. + */ +struct arm_m_cs_ptrs { + void *out, *in, *lr_save, *lr_fixup; +}; +extern struct arm_m_cs_ptrs arm_m_cs_ptrs; + +static inline void arm_m_exc_tail(void) +{ +#ifdef CONFIG_MULTITHREADING + /* Dirty trickery. We defer as much interrupt-exit work until + * the very last moment, when the top-level ISR returns back + * into user code. We do this by replacing the topmost (!) LR + * return address in the stack frame with our fixup code at + * arm_m_exc_exit(). By running after the ISR return, it + * knows that the callee-save registers r4-r11 (which need to + * be saved to the outgoing thread) are restored. + * + * Obviously this only works if the ISR is "ABI-compliant + * enough". It doesn't have to have pushed a complete frame, + * but it does have to have put LR into its standard location. + * In practice generated code does (because it has to store LR + * somewhere so it can call other functions and then pop it to + * return), so this works even on code built with + * -fomit-frame-pointer. If an app needs a direct interrupt + * and can't meet these requirents, it can always skip this + * call and return directly (reschedule is optional for direct + * interrupts anyway). + * + * Finally note the call to check_stack_sentinel here: that is + * normally called from context switch at the end, but will + * toss an exception, which we can't allow (without hardship) + * on the path from here to interrupt exit. It will mess up + * our bookeeping around EXC_RETURN, so do it early. + */ + void z_check_stack_sentinel(void); + void *isr_lr = (void *) *arm_m_exc_lr_ptr; + + if (IS_ENABLED(CONFIG_STACK_SENTINEL)) { + z_check_stack_sentinel(); + } + if (isr_lr != arm_m_cs_ptrs.lr_fixup) { + arm_m_cs_ptrs.lr_save = isr_lr; + *arm_m_exc_lr_ptr = (uint32_t) arm_m_cs_ptrs.lr_fixup; + } +#endif +} + +static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) +{ +#if defined(CONFIG_USERSPACE) || defined(CONFIG_MPU_STACK_GUARD) + z_arm_configure_dynamic_mpu_regions(_current); +#endif + +#ifdef CONFIG_THREAD_LOCAL_STORAGE + z_arm_tls_ptr = _current->tls; +#endif + +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + /* Need to manage CONTROL.nPRIV bit. 
We know the outgoing + * thread is in privileged mode (because you can't reach a + * context switch unless you're in the kernel!). + */ + extern uint32_t arm_m_switch_control; + uint32_t control; + struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread, switch_handle); + + old->arch.mode &= ~1; + __asm__ volatile("mrs %0, control" : "=r"(control)); + __ASSERT_NO_MSG((control & 1) == 0); + arm_m_switch_control = (control & ~1) | (_current->arch.mode & 1); +#endif + + /* new switch handle in r4, old switch handle pointer in r5. + * r6-r8 are used by the code here, and r9-r11 are + * unsaved/clobbered (they are very likely to be caller-saved + * registers in the enclosing function that the compiler can + * avoid using, i.e. we can let it make the call and avoid a + * double-spill). But all registers are restored fully + * (because we might be switching to an interrupt-saved frame) + */ + register uint32_t r4 __asm__("r4") = (uint32_t)switch_to; + register uint32_t r5 __asm__("r5") = (uint32_t)switched_from; + __asm__ volatile( + /* Construct and push a {r12, lr, pc} group at the top + * of the frame, where PC points to the final restore location + * at the end of this sequence. + */ + "mov r6, r12;" + "mov r7, lr;" + "ldr r8, =3f;" /* address of restore PC */ + "add r8, r8, #1;" /* set thumb bit */ + "push {r6-r8};" + "sub sp, sp, #24;" /* skip over space for r6-r11 */ + "push {r0-r5};" + "mov r2, #0x01000000;" /* APSR (only care about thumb bit) */ + "mov r0, #0;" /* Leave r0 zero for code blow */ +#ifdef CONFIG_BUILTIN_STACK_GUARD + "mrs r1, psplim;" + "push {r1-r2};" + "msr psplim, r0;" /* zero it so we can move the stack */ +#else + "push {r2};" +#endif + +#ifdef CONFIG_FPU + /* Push FPU state (if active) to our outgoing stack */ + " mrs r8, control;" /* read CONTROL.FPCA */ + " and r7, r8, #4;" /* r7 == have_fpu */ + " cbz r7, 1f;" + " bic r8, r8, #4;" /* clear CONTROL.FPCA */ + " msr control, r8;" + " vmrs r6, fpscr;" + " push {r6};" + " vpush {s0-s31};" + "1: push {r7};" /* have_fpu word */ + + /* Pop FPU state (if present) from incoming frame in r4 */ + " ldm r4!, {r7};" /* have_fpu word */ + " cbz r7, 2f;" + " vldm r4!, {s0-s31};" /* (note: sets FPCA bit for us) */ + " ldm r4!, {r6};" + " vmsr fpscr, r6;" + "2:;" +#endif + +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + " ldr r8, =arm_m_switch_control;" + " ldr r8, [r8];" + " msr control, r8;" +#endif + + /* Save the outgoing switch handle (which is SP), swap stacks, + * and enable interrupts. The restore process is + * interruptible code (running in the incoming thread) once + * the stack is valid. + */ + "str sp, [r5];" + "mov sp, r4;" + "msr basepri, r0;" + + /* Restore is super simple: pop the flags (and stack limit if + * enabled) then slurp in the whole GPR set in two + * instructions. (The instruction encoding disallows popping + * both LR and PC in a single instruction) + */ +#ifdef CONFIG_BUILTIN_STACK_GUARD + "pop {r1-r2};" + "msr psplim, r1;" +#else + "pop {r2};" +#endif +#ifdef _ARM_M_SWITCH_HAVE_DSP + "msr apsr_nzcvqg, r2;" /* bonkers syntax */ +#else + "msr apsr_nzcvq, r2;" /* not even source-compatible! 
*/ +#endif + "pop {r0-r12, lr};" + "pop {pc};" + + "3:" /* Label for restore address */ + ::"r"(r4), + "r"(r5) + : "r6", "r7", "r8", "r9", "r10", "r11"); +} + +#ifdef CONFIG_USE_SWITCH +static ALWAYS_INLINE void arch_switch(void *switch_to, void **switched_from) +{ + arm_m_switch(switch_to, switched_from); +} +#endif + +#endif /* _ZEPHYR_ARCH_ARM_M_SWITCH_H */ From 54ec18c9f8110b2206b2b47284bf274f4b03b64e Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 4 Feb 2025 09:56:42 -0800 Subject: [PATCH 05/27] tests/arch/arm: Add unit test for new arch_switch() implementation This is a standalone unit test of the new context code, which builds in isolation as a MULTITHREADING=n binary. No integration with the core arch layer is required. Signed-off-by: Andy Ross --- tests/arch/arm/arm_switch/CMakeLists.txt | 11 ++ tests/arch/arm/arm_switch/prj.conf | 5 + tests/arch/arm/arm_switch/src/main.c | 205 +++++++++++++++++++++++ tests/arch/arm/arm_switch/testcase.yaml | 6 + 4 files changed, 227 insertions(+) create mode 100644 tests/arch/arm/arm_switch/CMakeLists.txt create mode 100644 tests/arch/arm/arm_switch/prj.conf create mode 100644 tests/arch/arm/arm_switch/src/main.c create mode 100644 tests/arch/arm/arm_switch/testcase.yaml diff --git a/tests/arch/arm/arm_switch/CMakeLists.txt b/tests/arch/arm/arm_switch/CMakeLists.txt new file mode 100644 index 0000000000000..f78d1c206421f --- /dev/null +++ b/tests/arch/arm/arm_switch/CMakeLists.txt @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20.0) + +find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE}) +project(arm_switch) + +target_sources(app PRIVATE src/main.c) + +# Need to unit test arch headers +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/arch/${ARCH}/include) diff --git a/tests/arch/arm/arm_switch/prj.conf b/tests/arch/arm/arm_switch/prj.conf new file mode 100644 index 0000000000000..45473f9d6e344 --- /dev/null +++ b/tests/arch/arm/arm_switch/prj.conf @@ -0,0 +1,5 @@ +CONFIG_ZTEST=y +CONFIG_MULTITHREADING=n +CONFIG_FPU=y +CONFIG_MPU=n +CONFIG_ARM_MPU=n diff --git a/tests/arch/arm/arm_switch/src/main.c b/tests/arch/arm/arm_switch/src/main.c new file mode 100644 index 0000000000000..4e656e2781ea8 --- /dev/null +++ b/tests/arch/arm/arm_switch/src/main.c @@ -0,0 +1,205 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +#include +#include + +#ifdef CONFIG_MULTITHREADING +#error "Test does not work with CONFIG_MULTITHREADING=n" +#endif + +char stack[4096]; + +void *main_sh, *my_sh; + +/* Aforementioned dirty trick: this value becomes the "next switch + * handle" examined by the interrupt exit code in lieu of whatever the + * scheduler would return. 
+ */ +void *next_sh; + +int sum; + +bool my_fn_ran; + +extern void *arm_m_last_switch_handle; + +void *z_get_next_switch_handle(void *interrupted) +{ + return next_sh; +} + +void my_fn(void *a, void *b, void *c, void *d) +{ + void *psplim; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + + printk("%s: PSPLIM = %p\n", __func__, psplim); + + zassert_equal((int)a, 0); + zassert_equal((int)b, 1); + zassert_equal((int)c, 2); + + register int A = 11; + register int B = 12; + register int C = 13; + register int D = 14; + register int E = 15; + +#ifdef CONFIG_CPU_HAS_FPU + register float FA = 11.0f; + register float FB = 12.0f; + register float FC = 13.0f; +#endif + + for (int n = 0; /**/; n++) { + printk("%s:%d iter %d\n", __func__, __LINE__, n); + + if (arm_m_last_switch_handle) { + printk("Using exception handle @ %p\n", arm_m_last_switch_handle); + main_sh = arm_m_last_switch_handle; + arm_m_last_switch_handle = NULL; + } + + my_fn_ran = true; + + arm_m_switch(main_sh, &my_sh); + + zassert_equal(A, 11); + zassert_equal(B, 12); + zassert_equal(C, 13); + zassert_equal(D, 14); + zassert_equal(E, 15); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 11.0f); + zassert_equal(FB, 12.0f); + zassert_equal(FC, 13.0f); +#endif + } +} + +/* Nothing in particular to do here except exercise the interrupt exit hook */ +void my_svc(void) +{ + printk("%s:%d\n", __func__, __LINE__); + arm_m_exc_tail(); + + /* Validate that the tail hook doesn't need to be last */ + printk(" arm_m_exc_tail() has been called\n"); +} + +ZTEST(arm_m_switch, test_smoke) +{ + void *psplim; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + printk("In main, PSPLIM = %p\n", psplim); + + /* "register" variables don't strictly force the compiler not + * to spill them, but inspecting the generated code shows it's + * leaving them in place across the switch. 
+ */ + register int A = 1; + register int B = 2; + register int C = 3; + register int D = 4; + register int E = 5; + +#ifdef CONFIG_CPU_HAS_FPU + /* Prime all the FPU registers with something I can recognize in a debugger */ + uint32_t sregs[32]; + + for (int i = 0; i < 32; i++) { + sregs[i] = 0x3f800000 + i; + } + __asm__ volatile("vldm %0, {s0-s31}" ::"r"(sregs)); + + register float FA = 1.0f; + register float FB = 2.0f; + register float FC = 3.0f; +#endif + + sum += A + B + C + D + E; + + /* Hit an interrupt and make sure CPU state doesn't get messed up */ + printk("Invoking SVC\n"); + __asm__ volatile("svc 0"); + printk("...back\n"); + + zassert_equal(A, 1); + zassert_equal(B, 2); + zassert_equal(C, 3); + zassert_equal(D, 4); + zassert_equal(E, 5); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 1.0f); + zassert_equal(FB, 2.0f); + zassert_equal(FC, 3.0f); +#endif + + /* Now likewise switch to and from a foreign stack and check */ + my_sh = arm_m_new_stack(stack, sizeof(stack), my_fn, (void *)0, (void *)1, (void *)2, NULL); + + int cycles = 16; + + for (int n = 0; n < cycles; n++) { + printk("main() switching to my_fn() (iter %d)...\n", n); + my_fn_ran = false; + arm_m_switch(my_sh, &main_sh); + printk("...and back\n"); + + zassert_true(my_fn_ran); + zassert_equal(A, 1); + zassert_equal(B, 2); + zassert_equal(C, 3); + zassert_equal(D, 4); + zassert_equal(E, 5); +#ifdef CONFIG_CPU_HAS_FPU + zassert_equal(FA, 1.0f); + zassert_equal(FB, 2.0f); + zassert_equal(FC, 3.0f); +#endif + } +} + +/* Makes a copy of the vector table in writable RAM (it's generally in + * a ROM section), redirects it, and hooks the SVC interrupt with our + * own code above so we can catch direct interrupts. + */ +void *vector_hijack(void) +{ + static uint32_t __aligned(1024) vectors[256]; + uint32_t *vtor_p = (void *)0xe000ed08; + uint32_t *vtor = (void *)*vtor_p; + + printk("VTOR @%p\n", vtor); + if (vtor == NULL) { + /* mps2/an385 doesn't set this up, don't know why */ + printk("VTOR not set up by SOC, skipping case\n"); + ztest_test_skip(); + return NULL; + } + + /* Vector count: _vector_start/end set by the linker. */ + int nv = (&_vector_end[0] - &_vector_start[0]) / sizeof(uint32_t); + + for (int i = 0; i < nv; i++) { + vectors[i] = vtor[i]; + } + *vtor_p = (uint32_t)&vectors[0]; + vtor = (void *)*vtor_p; + printk("VTOR now @%p\n", vtor); + + /* And hook the SVC call with our own function above, allowing + * us direct access to interrupt entry + */ + vtor[11] = (int)my_svc; + printk("vtor[11] == %p (my_svc == %p)\n", (void *)vtor[11], my_svc); + + return NULL; +} + +ZTEST_SUITE(arm_m_switch, NULL, vector_hijack, NULL, NULL, NULL); diff --git a/tests/arch/arm/arm_switch/testcase.yaml b/tests/arch/arm/arm_switch/testcase.yaml new file mode 100644 index 0000000000000..12d2439155657 --- /dev/null +++ b/tests/arch/arm/arm_switch/testcase.yaml @@ -0,0 +1,6 @@ +tests: + arch.arm.switch: + arch_allow: arm + filter: CONFIG_CPU_CORTEX_M and CONFIG_USE_SWITCH + tags: + - arm From c16be472815ce2b043d2b090180611e1c7b631ce Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 4 Feb 2025 10:02:26 -0800 Subject: [PATCH 06/27] arch/arm: Platform integration for new Cortex M arch_switch() Integrate the new context layer, allowing it to be selected via the pre-existing CONFIG_USE_SWITCH. Not a lot of changes, but notable ones: + There was code in the MPU layer to adjust PSP on exception exit at a stack overflow so that it remained inside the defined stack bounds. 
With the new context layer though, exception exit will rewrite the stack frame in a larger format, and needs PSP to be adjusted to make room. + There was no such treatment in the PSPLIM case (the hardware prents the SP from going that low), so I had to add similar code to validate PSP at exit from fault handling. + The various return paths for fault/svc assembly handlers need to call out to the switch code to do the needed scheduler work. Really almost all of these can be replaced with C now, only userspace syscall entry (which has to "return" into the privileged stack) needs special treatment. + There is a gcc bug that prevents the arch_switch() inline assembly from building when frame pointers are enabled (which they almost never are on ARM): it disallows you from touching r7 (the thumb frame pointer) entirely. But it's a context switch, we need to! Worked around by enforcing -fomit-frame-pointer even in the two scheduler files that can swap when NO_OPTIMIZATIONS=y. Signed-off-by: Andy Ross --- arch/arm/core/Kconfig | 4 +-- arch/arm/core/cortex_m/CMakeLists.txt | 3 +- arch/arm/core/cortex_m/Kconfig | 4 +++ arch/arm/core/cortex_m/fault.c | 25 ++++++++++++++ arch/arm/core/cortex_m/fault_s.S | 2 +- arch/arm/core/cortex_m/swap_helper.S | 17 +++++++++- arch/arm/core/cortex_m/thread.c | 34 ++++++++++++++------ arch/arm/core/cortex_m/vector_table.S | 2 +- arch/arm/include/cortex_m/kernel_arch_func.h | 4 +++ include/zephyr/arch/arm/cortex_m/exception.h | 10 ++++++ include/zephyr/arch/arm/irq.h | 13 ++++++-- kernel/CMakeLists.txt | 19 +++++++++++ tests/arch/arm/arm_interrupt/testcase.yaml | 2 +- 13 files changed, 119 insertions(+), 20 deletions(-) diff --git a/arch/arm/core/Kconfig b/arch/arm/core/Kconfig index 5505eee4e19b4..125e8e4bf954f 100644 --- a/arch/arm/core/Kconfig +++ b/arch/arm/core/Kconfig @@ -6,11 +6,11 @@ config CPU_CORTEX_M bool select CPU_CORTEX - select ARCH_HAS_CUSTOM_SWAP_TO_MAIN + select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !USE_SWITCH select HAS_CMSIS_CORE select HAS_FLASH_LOAD_OFFSET select ARCH_HAS_SINGLE_THREAD_SUPPORT - select ARCH_HAS_THREAD_ABORT + select ARCH_HAS_THREAD_ABORT if !USE_SWITCH select ARCH_HAS_TRUSTED_EXECUTION if ARM_TRUSTZONE_M select ARCH_HAS_STACK_PROTECTION if (ARM_MPU && !ARMV6_M_ARMV8_M_BASELINE) || CPU_CORTEX_M_HAS_SPLIM select ARCH_HAS_USERSPACE if ARM_MPU diff --git a/arch/arm/core/cortex_m/CMakeLists.txt b/arch/arm/core/cortex_m/CMakeLists.txt index bfa3ff8887381..0e5aa94676c26 100644 --- a/arch/arm/core/cortex_m/CMakeLists.txt +++ b/arch/arm/core/cortex_m/CMakeLists.txt @@ -20,13 +20,11 @@ elseif(CONFIG_ARMV8_1_M_PACBTI_NONE) endif() zephyr_library_sources( - exc_exit.c fault.c fault_s.S fpu.c reset.S scb.c - thread_abort.c vector_table.S swap_helper.S irq_manage.c @@ -36,6 +34,7 @@ zephyr_library_sources( ) zephyr_library_sources_ifdef(CONFIG_USE_SWITCH arm-m-switch.c) +zephyr_library_sources_ifndef(CONFIG_USE_SWITCH exc_exit.c thread_abort.c) zephyr_library_sources_ifndef(CONFIG_ARM_CUSTOM_INTERRUPT_CONTROLLER irq_init.c) zephyr_library_sources_ifdef(CONFIG_GEN_SW_ISR_TABLE isr_wrapper.c) diff --git a/arch/arm/core/cortex_m/Kconfig b/arch/arm/core/cortex_m/Kconfig index cb7f134598f3a..031bd7180f41f 100644 --- a/arch/arm/core/cortex_m/Kconfig +++ b/arch/arm/core/cortex_m/Kconfig @@ -106,6 +106,9 @@ config CPU_CORTEX_M7 if CPU_CORTEX_M +config USE_SWITCH + default y if USE_SWITCH_SUPPORTED && !ARM_NONSECURE_FIRMWARE + config CPU_CORTEX_M_HAS_SYSTICK bool help @@ -226,6 +229,7 @@ config ARMV7_M_ARMV8_M_MAINLINE select CPU_CORTEX_M_HAS_VTOR select 
CPU_CORTEX_M_HAS_PROGRAMMABLE_FAULT_PRIOS select CPU_CORTEX_M_HAS_SYSTICK + select USE_SWITCH_SUPPORTED help This option signifies the use of an ARMv7-M processor implementation, or the use of a backwards-compatible diff --git a/arch/arm/core/cortex_m/fault.c b/arch/arm/core/cortex_m/fault.c index 204b403535cd0..a7cd653381614 100644 --- a/arch/arm/core/cortex_m/fault.c +++ b/arch/arm/core/cortex_m/fault.c @@ -1081,6 +1081,31 @@ void z_arm_fault(uint32_t msp, uint32_t psp, uint32_t exc_return, _callee_saved_ } z_arm_fatal_error(reason, &esf_copy); + +#if defined(CONFIG_USE_SWITCH) + /* The arch_switch implementation can expand the size of the + * frame when it saves the outgoing context (to the faulting + * thread), so make sure it has room. The resulting stack + * will obviously be corrupt, but it won't overflow the + * guarded region. + */ +#if defined(CONFIG_BUILTIN_STACK_GUARD) + uint32_t psplim, psp2; + + __asm__ volatile("mrs %0, psplim" : "=r"(psplim)); + psp2 = MAX(psplim + arm_m_switch_stack_buffer, psp); + __asm__ volatile("msr psp, %0" ::"r"(psp2)); +#endif + + /* Prepare interrupt exit for context switch, but only if this + * is the lowest level interrupt (if we're nested, that was a + * fault that happened in the ISR and shouldn't affect thread + * state). + */ + if ((exc_return & 0xf000000f) == 0xf000000d) { + arm_m_exc_tail(); + } +#endif } /** diff --git a/arch/arm/core/cortex_m/fault_s.S b/arch/arm/core/cortex_m/fault_s.S index 43d8eaa34dd56..8b2e30d71af97 100644 --- a/arch/arm/core/cortex_m/fault_s.S +++ b/arch/arm/core/cortex_m/fault_s.S @@ -80,6 +80,7 @@ SECTION_SUBSEC_FUNC(TEXT,__fault,z_arm_exc_spurious) mrs r0, MSP mrs r1, PSP push {r0, lr} + #if defined(CONFIG_EXTRA_EXCEPTION_INFO) /* Build _callee_saved_t. To match the struct * definition we push the psp & then r11-r4 @@ -109,5 +110,4 @@ SECTION_SUBSEC_FUNC(TEXT,__fault,z_arm_exc_spurious) add sp, #40 #endif pop {r0, pc} - .end diff --git a/arch/arm/core/cortex_m/swap_helper.S b/arch/arm/core/cortex_m/swap_helper.S index 40f00fc005629..0dc4692bc1c64 100644 --- a/arch/arm/core/cortex_m/swap_helper.S +++ b/arch/arm/core/cortex_m/swap_helper.S @@ -432,9 +432,19 @@ SECTION_FUNC(TEXT, z_arm_svc) pop {r0, lr} #endif - /* exception return is done in z_arm_int_exit() */ +#ifdef CONFIG_USE_SWITCH + /* FIXME: this uses the pre-hook LR, but we want to refetch the + * one in the stack frame, otherwise irq_offload() won't context + * switch correctly (which... isn't tested?) 
+ */ + push {r0, lr} + bl arm_m_legacy_exit + pop {r0, lr} + bx lr +#else ldr r0, =z_arm_int_exit bx r0 +#endif #endif @@ -464,6 +474,11 @@ SECTION_FUNC(TEXT, z_arm_svc) add sp, #40 #endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ #endif /* CONFIG_EXTRA_EXCEPTION_INFO */ + +#ifdef CONFIG_USE_SWITCH + bl arm_m_legacy_exit +#endif + pop {r0, pc} #if defined(CONFIG_USERSPACE) diff --git a/arch/arm/core/cortex_m/thread.c b/arch/arm/core/cortex_m/thread.c index a1e281af4d5a5..51f3150cdf42a 100644 --- a/arch/arm/core/cortex_m/thread.c +++ b/arch/arm/core/cortex_m/thread.c @@ -60,8 +60,6 @@ K_THREAD_STACK_DECLARE(z_main_stack, CONFIG_MAIN_STACK_SIZE); void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *stack_ptr, k_thread_entry_t entry, void *p1, void *p2, void *p3) { - struct __basic_sf *iframe; - #ifdef CONFIG_MPU_STACK_GUARD #if defined(CONFIG_USERSPACE) if (z_stack_is_user_capable(stack)) { @@ -85,17 +83,22 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta #endif /* FP_GUARD_EXTRA_SIZE */ #endif /* CONFIG_MPU_STACK_GUARD */ - iframe = Z_STACK_PTR_TO_FRAME(struct __basic_sf, stack_ptr); + void *entry_wrapper = z_thread_entry; + #if defined(CONFIG_USERSPACE) if ((thread->base.user_options & K_USER) != 0) { - iframe->pc = (uint32_t)arch_user_mode_enter; - } else { - iframe->pc = (uint32_t)z_thread_entry; + entry_wrapper = (void *)arch_user_mode_enter; } -#else - iframe->pc = (uint32_t)z_thread_entry; #endif +#ifdef CONFIG_USE_SWITCH + thread->switch_handle = arm_m_new_stack((char *)stack, stack_ptr - (char *)stack, + entry_wrapper, entry, p1, p2, p3); +#else + struct __basic_sf *iframe = Z_STACK_PTR_TO_FRAME(struct __basic_sf, stack_ptr); + + iframe->pc = (uint32_t)entry_wrapper; + /* force ARM mode by clearing LSB of address */ iframe->pc &= 0xfffffffe; iframe->a1 = (uint32_t)entry; @@ -106,6 +109,8 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta iframe->xpsr = 0x01000000UL; /* clear all, thumb bit is 1, even if RO */ thread->callee_saved.psp = (uint32_t)iframe; +#endif + thread->arch.basepri = 0; #if defined(CONFIG_ARM_STORE_EXC_RETURN) || defined(CONFIG_USERSPACE) @@ -355,7 +360,7 @@ void configure_builtin_stack_guard(struct k_thread *thread) * @return The lowest allowed stack frame pointer, if error is a * thread stack corruption, otherwise return 0. 
*/ -uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp) +static uint32_t min_stack(const uint32_t fault_addr, const uint32_t psp) { #if defined(CONFIG_MULTITHREADING) const struct k_thread *thread = _current; @@ -418,6 +423,17 @@ uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp return 0; } + +uint32_t z_check_thread_stack_fail(const uint32_t fault_addr, const uint32_t psp) +{ + uint32_t sp = min_stack(fault_addr, psp); + + if (sp != 0 && IS_ENABLED(CONFIG_USE_SWITCH)) { + sp += arm_m_switch_stack_buffer; + } + return sp; +} + #endif /* CONFIG_MPU_STACK_GUARD || CONFIG_USERSPACE */ #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) diff --git a/arch/arm/core/cortex_m/vector_table.S b/arch/arm/core/cortex_m/vector_table.S index 45e10ea803499..226cd636caf5b 100644 --- a/arch/arm/core/cortex_m/vector_table.S +++ b/arch/arm/core/cortex_m/vector_table.S @@ -74,7 +74,7 @@ SECTION_SUBSEC_FUNC(exc_vector_table,_vector_table_section,_vector_table) #error Unknown ARM architecture #endif /* CONFIG_ARMV6_M_ARMV8_M_BASELINE */ .word 0 -#if defined(CONFIG_MULTITHREADING) +#if defined(CONFIG_MULTITHREADING) && !defined(USE_SWITCH) .word z_arm_pendsv #else .word z_arm_exc_spurious diff --git a/arch/arm/include/cortex_m/kernel_arch_func.h b/arch/arm/include/cortex_m/kernel_arch_func.h index 5abe5870762cf..befc8ae20071c 100644 --- a/arch/arm/include/cortex_m/kernel_arch_func.h +++ b/arch/arm/include/cortex_m/kernel_arch_func.h @@ -27,6 +27,8 @@ extern "C" { #endif #ifndef _ASMLANGUAGE +#include + extern void z_arm_fault_init(void); extern void z_arm_cpu_idle_init(void); #ifdef CONFIG_ARM_MPU @@ -61,10 +63,12 @@ static ALWAYS_INLINE void arch_kernel_init(void) #endif /* CONFIG_SOC_PER_CORE_INIT_HOOK */ } +#ifndef CONFIG_USE_SWITCH static ALWAYS_INLINE void arch_thread_return_value_set(struct k_thread *thread, unsigned int value) { thread->arch.swap_return_value = value; } +#endif #if !defined(CONFIG_MULTITHREADING) extern FUNC_NORETURN void z_arm_switch_to_main_no_multithreading(k_thread_entry_t main_func, diff --git a/include/zephyr/arch/arm/cortex_m/exception.h b/include/zephyr/arch/arm/cortex_m/exception.h index d675efa1549d0..2b90e797b2799 100644 --- a/include/zephyr/arch/arm/cortex_m/exception.h +++ b/include/zephyr/arch/arm/cortex_m/exception.h @@ -15,6 +15,9 @@ #include #include +#ifndef _ASMLANGUAGE +#include +#endif /* for assembler, only works with constants */ #define Z_EXC_PRIO(pri) (((pri) << (8 - NUM_IRQ_PRIO_BITS)) & 0xff) @@ -125,7 +128,14 @@ struct arch_esf { extern uint32_t z_arm_coredump_fault_sp; +#ifdef CONFIG_USE_SWITCH +static inline void z_arm_exc_exit(void) +{ + arm_m_exc_tail(); +} +#else extern void z_arm_exc_exit(void); +#endif #ifdef __cplusplus } diff --git a/include/zephyr/arch/arm/irq.h b/include/zephyr/arch/arm/irq.h index 40d467aa3d318..a63098d4a0138 100644 --- a/include/zephyr/arch/arm/irq.h +++ b/include/zephyr/arch/arm/irq.h @@ -18,6 +18,9 @@ #include #include +#if !defined(_ASMLANGUAGE) && defined(CONFIG_CPU_CORTEX_M) +#include +#endif #ifdef __cplusplus extern "C" { @@ -74,7 +77,14 @@ void z_soc_irq_eoi(unsigned int irq); #endif +#if defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) +static inline void z_arm_int_exit(void) +{ + arm_m_exc_tail(); +} +#else extern void z_arm_int_exit(void); +#endif extern void z_arm_interrupt_init(void); @@ -144,9 +154,6 @@ extern void _arch_isr_direct_pm(void); #define ARCH_ISR_DIRECT_HEADER() arch_isr_direct_header() #define ARCH_ISR_DIRECT_FOOTER(swap) 
arch_isr_direct_footer(swap) -/* arch/arm/core/exc_exit.S */ -extern void z_arm_int_exit(void); - #ifdef CONFIG_TRACING_ISR extern void sys_trace_isr_enter(void); extern void sys_trace_isr_exit(void); diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 075e091cbb62d..d02cafcf698d3 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -61,6 +61,25 @@ list(APPEND kernel_files version.c ) +# Toolchain workaround: on ARM Thumb, gcc will normally enable +# -fomit-frame-pointer at all optimization levels except -O0. When +# frame pointers ARE enabled, gcc will also infuriatingly disallow a +# clobber of the r7 frame pointer register in inline assembly +# (vs. just spilling filling as needed). Our optimized arch_switch +# requires the ability to clobber this (callee-saved) register. Turn +# the flag back on for the two files that need it. (The error +# behavior only gets modified at the level of a whole file, I tried +# #pragmas and attributes to change the code generation for the +# specific functions without success). +if(CONFIG_CPU_CORTEX_M) + if(CONFIG_NO_OPTIMIZATIONS) + if(${COMPILER} STREQUAL "gcc") + set_source_files_properties(init.c PROPERTIES COMPILE_OPTIONS -fomit-frame-pointer) + set_source_files_properties(sched.c PROPERTIES COMPILE_OPTIONS -fomit-frame-pointer) + endif() + endif() +endif() + if(CONFIG_SCHED_CPU_MASK) list(APPEND kernel_files cpu_mask.c diff --git a/tests/arch/arm/arm_interrupt/testcase.yaml b/tests/arch/arm/arm_interrupt/testcase.yaml index df47b02fccd93..09241a3664b0a 100644 --- a/tests/arch/arm/arm_interrupt/testcase.yaml +++ b/tests/arch/arm/arm_interrupt/testcase.yaml @@ -18,6 +18,6 @@ tests: - CONFIG_MAIN_STACK_SIZE=2048 - CONFIG_ZTEST_STACK_SIZE=1280 arch.arm.interrupt.extra_exception_info: - filter: not CONFIG_TRUSTED_EXECUTION_NONSECURE + filter: ARCH_HAS_EXTRA_EXCEPTION_INFO and not CONFIG_TRUSTED_EXECUTION_NONSECURE extra_configs: - CONFIG_EXTRA_EXCEPTION_INFO=y From 3e38eef92c88697fad80388c6c3c6b36625f7b5a Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Fri, 1 Aug 2025 09:16:14 -0700 Subject: [PATCH 07/27] arch/arm: Handle multi-cycle instruction in USE_SWITCH ARM Cortex M has what amounts to a design bug. The architecture inherits several unpipelined/microcoded "ICI/IT" instruction forms that take many cycles to complete (LDM/STM and the Thumb "IT" conditional frame are the big ones). But out of a desire to minimize interrupt latency, the CPU is allowed to halt and resume these instructions mid-flight while they are partially completed. The relevant bits of state are stored in the EPSR fields of the xPSR register (see ARMv7-M manual B1.4.2). But (and this is the design bug) those bits CANNOT BE WRITTEN BY SOFTWARE. They can only be modified by exception return. This means that if a Zephyr thread takes an interrupt mid-ICI/IT-instruction, then switches to another thread on exit, and then that thread is resumed by a cooperative switch and not an interrupt, the instruction will lose the state and restart from scratch. For LDM/STM that's generally idempotent for memory (but not MMIO!), but for IT that means that the restart will re-execute arbitrary instructions that may not be idempotent (e.g. 
"addeq r0, r0, The fix is to check for this condition (which is very rare) on interrupt exit when we are switching, and if we discover we've interrupted such an instruction we swap the return address with a trampoline that uses a UDF instruction to immediately trap to the undefined instruction handler, which then recognizes the fixup address as special and immediately returns back into the thread with the correct EPSR value and resume PC (which have been stashed in the thread struct). The overhead for the normal case is just a few cycles for the test. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 94 ++++++++++++++++++++++++++ arch/arm/core/cortex_m/fault.c | 7 ++ arch/arm/core/cortex_m/thread.c | 1 + include/zephyr/arch/arm/arm-m-switch.h | 4 ++ include/zephyr/arch/arm/thread.h | 5 ++ 5 files changed, 111 insertions(+) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index 088b8fb0a7c21..778416b032a85 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -157,6 +157,18 @@ uint32_t arm_m_switch_control; } while (false) /* clang-format on */ +/* The arch/cpu/toolchain are horrifyingly inconsistent with how the + * thumb bit is treated in runtime addresses. The PC target for a B + * instruction must have it set. The PC pushed from an exception has + * is unset. The linker puts functions at even addresses, obviously, + * but the symbol address exposed at runtime has it set. Exception + * return ignores it. Use this to avoid insanity. + */ +static bool pc_match(uint32_t pc, void *addr) +{ + return ((pc ^ (uint32_t) addr) & ~1) == 0; +} + /* Reports if the passed return address is a valid EXC_RETURN (high * four bits set) that will restore to the PSP running in thread mode * (low four bits == 0xd). That is an interrupted Zephyr thread @@ -177,6 +189,72 @@ static bool fpu_state_pushed(uint32_t lr) return IS_ENABLED(CONFIG_CPU_HAS_FPU) ? !(lr & 0x10) : false; } +/* ICI/IT instruction fault workarounds + * + * ARM Cortex M has what amounts to a design bug. The architecture + * inherits several unpipelined/microcoded "ICI/IT" instruction forms + * that take many cycles to complete (LDM/STM and the Thumb "IT" + * conditional frame are the big ones). But out of a desire to + * minimize interrupt latency, the CPU is allowed to halt and resume + * these instructions mid-flight while they are partially completed. + * The relevant bits of state are stored in the EPSR fields of the + * xPSR register (see ARMv7-M manual B1.4.2). But (and this is the + * design bug) those bits CANNOT BE WRITTEN BY SOFTWARE. They can + * only be modified by exception return. + * + * This means that if a Zephyr thread takes an interrupt + * mid-ICI/IT-instruction, then switches to another thread on exit, + * and then that thread is resumed by a cooperative switch and not an + * interrupt, the instruction will lose the state and restart from + * scratch. For LDM/STM that's generally idempotent for memory (but + * not MMIO!), but for IT that means that the restart will re-execute + * arbitrary instructions that may not be idempotent (e.g. "addeq r0, + * r0, #1" can't be done twice, because you would add two to r0!) 
+ * + * The fix is to check for this condition (which is very rare) on + * interrupt exit when we are switching, and if we discover we've + * interrupted such an instruction we swap the return address with a + * trampoline that uses a UDF instruction to immediately trap to the + * undefined instruction handler, which then recognizes the fixup + * address as special and immediately returns back into the thread + * with the correct EPSR value and resume PC (which have been stashed + * in the thread struct). The overhead for the normal case is just a + * few cycles for the test. + */ +__asm__(".globl arm_m_iciit_stub;" + "arm_m_iciit_stub:;" + " udf 0;"); + +/* Called out of interrupt entry to test for an interrupted instruction */ +static void iciit_fixup(struct k_thread *th, struct hw_frame_base *hw, uint32_t xpsr) +{ +#ifdef CONFIG_MULTITHREADING + if ((xpsr & 0x0600fc00) != 0) { + /* Stash original return address, replace with hook */ + th->arch.iciit_pc = hw->pc; + th->arch.iciit_apsr = hw->apsr; + hw->pc = (uint32_t) arm_m_iciit_stub; + } +#endif +} + +/* Called out of fault handler from the UDF after an arch_switch() */ +bool arm_m_iciit_check(uint32_t msp, uint32_t psp, uint32_t lr) +{ + struct hw_frame_base *f = (void *)psp; + + /* Look for undefined instruction faults from our stub */ + if (pc_match(f->pc, arm_m_iciit_stub)) { + if (is_thread_return(lr)) { + f->pc = _current->arch.iciit_pc; + f->apsr = _current->arch.iciit_apsr; + _current->arch.iciit_pc = 0; + return true; + } + } + return false; +} + #ifdef CONFIG_BUILTIN_STACK_GUARD #define PSPLIM(f) ((f)->z.u.sw.psplim) #else @@ -259,6 +337,11 @@ static void *arm_m_cpu_to_switch(struct k_thread *th, void *sp, bool fpu) "msr control, %0;" ::"r"(dummy)); } + /* Detects interrupted ICI/IT instructions and rigs up thread + * to trap the next time it runs + */ + iciit_fixup(th, base, base->apsr); + if (IS_ENABLED(CONFIG_FPU) && fpu) { fpscr = CONTAINER_OF(sp, struct hw_frame_fpu, base)->fpscr; } @@ -395,6 +478,17 @@ bool arm_m_must_switch(uint32_t lr) next = arm_m_switch_to_cpu(next); __asm__ volatile("msr psp, %0" ::"r"(next)); + /* Undo a UDF fixup applied at interrupt time, no need: we're + * restoring EPSR via interrupt. + */ + if (_current->arch.iciit_pc) { + struct hw_frame_base *n = next; + + n->pc = _current->arch.iciit_pc; + n->apsr = _current->arch.iciit_apsr; + _current->arch.iciit_pc = 0; + } + #if !defined(CONFIG_MULTITHREADING) arm_m_last_switch_handle = last; #elif defined(CONFIG_USE_SWITCH) diff --git a/arch/arm/core/cortex_m/fault.c b/arch/arm/core/cortex_m/fault.c index a7cd653381614..f6f3a68719b4c 100644 --- a/arch/arm/core/cortex_m/fault.c +++ b/arch/arm/core/cortex_m/fault.c @@ -1029,6 +1029,13 @@ void z_arm_fault(uint32_t msp, uint32_t psp, uint32_t exc_return, _callee_saved_ bool recoverable, nested_exc; struct arch_esf *esf; +#ifdef CONFIG_USE_SWITCH + /* Handle the stub fault to restore interrupted ICI/IT instructions */ + if (arm_m_iciit_check(msp, psp, exc_return)) { + return; + } +#endif + /* Create a stack-ed copy of the ESF to be used during * the fault handling process. 
*/ diff --git a/arch/arm/core/cortex_m/thread.c b/arch/arm/core/cortex_m/thread.c index 51f3150cdf42a..25606a6b919eb 100644 --- a/arch/arm/core/cortex_m/thread.c +++ b/arch/arm/core/cortex_m/thread.c @@ -94,6 +94,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta #ifdef CONFIG_USE_SWITCH thread->switch_handle = arm_m_new_stack((char *)stack, stack_ptr - (char *)stack, entry_wrapper, entry, p1, p2, p3); + thread->arch.iciit_pc = 0; #else struct __basic_sf *iframe = Z_STACK_PTR_TO_FRAME(struct __basic_sf, stack_ptr); diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h index 2f6de26b63f68..2187856dfcca9 100644 --- a/include/zephyr/arch/arm/arm-m-switch.h +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -40,6 +40,10 @@ bool arm_m_must_switch(void); void arm_m_exc_exit(void); +bool arm_m_iciit_check(uint32_t msp, uint32_t psp, uint32_t lr); + +void arm_m_iciit_stub(void); + extern uint32_t *arm_m_exc_lr_ptr; void z_arm_configure_dynamic_mpu_regions(struct k_thread *thread); diff --git a/include/zephyr/arch/arm/thread.h b/include/zephyr/arch/arm/thread.h index 139f606809f9d..3e73634d7b3fe 100644 --- a/include/zephyr/arch/arm/thread.h +++ b/include/zephyr/arch/arm/thread.h @@ -68,6 +68,11 @@ struct _thread_arch { /* r0 in stack frame cannot be written to reliably */ uint32_t swap_return_value; +#if defined(CONFIG_USE_SWITCH) && defined(CONFIG_CPU_CORTEX_M) + uint32_t iciit_pc; + uint32_t iciit_apsr; +#endif + #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) /* * No cooperative floating point register set structure exists for From 89cd7fc12a851253df43e44fb6d44fb3f6c41c52 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 5 Feb 2025 10:06:13 -0800 Subject: [PATCH 08/27] kernel: Minor optimizations to z_swap() Pick some low hanging fruit on non-SMP code paths: + The scheduler spinlock is always taken, but as we're already in an irqlocked state that's a noop. But the optmizer can't tell, because arch_irq_lock() involves an asm block it can't see inside. Elide the call when possible. + The z_swap_next_thread() function evaluates to just a single load of _kernel.ready_q.cache when !SMP, but wasn't being inlined because of function location. Move that test up into do_swap() so it's always done correctly. 
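For reference, a standalone host-buildable sketch of the reasoning above (stand-in names only, not kernel code): the compiler cannot prove that a call containing an asm block is a no-op, but it can delete the call entirely when it sits behind a compile-time-constant IS_ENABLED()-style guard, which is the same trick the kswap.h hunk below relies on. GCC/clang syntax assumed.

  #include <stdio.h>

  #define FAKE_IS_ENABLED_SMP 0	/* stand-in for IS_ENABLED(CONFIG_SMP) */

  static unsigned int fake_irq_lock(void)
  {
  	unsigned int key = 0;

  	/* Opaque to the optimizer, like the real arch_irq_lock() */
  	__asm__ volatile("" : "+r"(key));
  	return key;
  }

  static void take_nested_sched_lock(void)
  {
  	if (FAKE_IS_ENABLED_SMP) {
  		/* Constant-folded away in uniprocessor builds */
  		(void)fake_irq_lock();
  	}
  }

  int main(void)
  {
  	take_nested_sched_lock();
  	printf("nested lock elided at compile time\n");
  	return 0;
  }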
Signed-off-by: Andy Ross --- kernel/include/kswap.h | 11 +++++++++-- kernel/sched.c | 4 ---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h index cff3efab6e90e..64eca71baf1f6 100644 --- a/kernel/include/kswap.h +++ b/kernel/include/kswap.h @@ -114,11 +114,18 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key, if (is_spinlock && lock != NULL && lock != &_sched_spinlock) { k_spin_release(lock); } - if (!is_spinlock || lock != &_sched_spinlock) { - (void) k_spin_lock(&_sched_spinlock); + if (IS_ENABLED(CONFIG_SMP) || IS_ENABLED(CONFIG_SPIN_VALIDATE)) { + /* Taking a nested uniprocessor lock in void context is a noop */ + if (!is_spinlock || lock != &_sched_spinlock) { + (void)k_spin_lock(&_sched_spinlock); + } } +#ifdef CONFIG_SMP new_thread = z_swap_next_thread(); +#else + new_thread = _kernel.ready_q.cache; +#endif if (new_thread != old_thread) { z_sched_usage_switch(new_thread); diff --git a/kernel/sched.c b/kernel/sched.c index 7ed11942107c3..9a50452cf1f1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -800,7 +800,6 @@ void k_sched_unlock(void) struct k_thread *z_swap_next_thread(void) { -#ifdef CONFIG_SMP struct k_thread *ret = next_up(); if (ret == _current) { @@ -811,9 +810,6 @@ struct k_thread *z_swap_next_thread(void) signal_pending_ipi(); } return ret; -#else - return _kernel.ready_q.cache; -#endif /* CONFIG_SMP */ } #ifdef CONFIG_USE_SWITCH From 2a4bb5e1e9d367e2461225a5ea1d24ae71f18d03 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 5 Feb 2025 11:01:56 -0800 Subject: [PATCH 09/27] tests/benchmarks: Add a tiny microbenchmark for z_swap() Benchmarking context switch in the absence of other polluting factors is actually really hard. I wanted a microbenchmark of just z_swap() for tuning the ARM arch_switch() code, and nothing we have is very useful. This one works well, measuring ONLY the time taken for a thread to enter z_swap(), switch to another thread already suspended in z_swap(), and return to a timestamp on the other side. It runs a few thousand iterations and reporting average and standard deviation. 
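The statistics it prints are nothing exotic. A standalone host-buildable sketch of the same mean/standard-deviation report (simplified to floating point, sample values invented purely for illustration; link with libm):

  #include <math.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Plain mean and standard deviation over collected cycle deltas */
  static void report(const char *name, const uint32_t *samples, size_t n)
  {
  	uint64_t total = 0;

  	for (size_t i = 0; i < n; i++) {
  		total += samples[i];
  	}

  	double avg = (double)total / (double)n, var = 0.0;

  	for (size_t i = 0; i < n; i++) {
  		double d = (double)samples[i] - avg;

  		var += d * d;
  	}

  	printf("%s samples=%zu average %.1f stdev %.1f\n",
  	       name, n, avg, sqrt(var / n));
  }

  int main(void)
  {
  	/* Invented demo values, only to exercise the math */
  	const uint32_t demo[] = {120, 118, 121, 119, 150, 120};

  	report("SWAP", demo, sizeof(demo) / sizeof(demo[0]));
  	return 0;
  }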
Signed-off-by: Andy Ross --- tests/benchmarks/swap/CMakeLists.txt | 12 ++ tests/benchmarks/swap/prj.conf | 9 ++ tests/benchmarks/swap/src/main.c | 146 +++++++++++++++++++++++ tests/benchmarks/swap/src/time_arm_m.h | 24 ++++ tests/benchmarks/swap/src/time_generic.h | 16 +++ 5 files changed, 207 insertions(+) create mode 100644 tests/benchmarks/swap/CMakeLists.txt create mode 100644 tests/benchmarks/swap/prj.conf create mode 100644 tests/benchmarks/swap/src/main.c create mode 100644 tests/benchmarks/swap/src/time_arm_m.h create mode 100644 tests/benchmarks/swap/src/time_generic.h diff --git a/tests/benchmarks/swap/CMakeLists.txt b/tests/benchmarks/swap/CMakeLists.txt new file mode 100644 index 0000000000000..271fa88a90fd7 --- /dev/null +++ b/tests/benchmarks/swap/CMakeLists.txt @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20.0) + +find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE}) +project(hello_world) + +target_sources(app PRIVATE src/main.c) + +# Needed to call z_swap() +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/kernel/include) +target_include_directories(app PRIVATE ${ZEPHYR_BASE}/arch/${ARCH}/include) diff --git a/tests/benchmarks/swap/prj.conf b/tests/benchmarks/swap/prj.conf new file mode 100644 index 0000000000000..a4c53f742e6cc --- /dev/null +++ b/tests/benchmarks/swap/prj.conf @@ -0,0 +1,9 @@ +CONFIG_SPIN_VALIDATE=n +CONFIG_ASSERT=n +CONFIG_MPU=n +CONFIG_ARM_MPU=n +CONFIG_FPU=n +CONFIG_TIMESLICING=n +CONFIG_HW_STACK_PROTECTION=n +CONFIG_THREAD_LOCAL_STORAGE=n +CONFIG_MINIMAL_LIBC=y diff --git a/tests/benchmarks/swap/src/main.c b/tests/benchmarks/swap/src/main.c new file mode 100644 index 0000000000000..1b744cbcfaad4 --- /dev/null +++ b/tests/benchmarks/swap/src/main.c @@ -0,0 +1,146 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* This app implements a simple microbenchmark of z_swap(), carefully + * tuned to measure only cooperative swapping performance and nothing + * else. Build it and run. No special usage needed. + * + * Subtle sequencing, see comments below. This runs without a timer + * driver (and in fact disables ARM SysTick so it can use it + * directly), controlling execution order via scheduler priority only. + */ + +#include +#include +#include + +/* Use a microtuned platform timing hook if available, otherwise k_cycle_get_32(). */ +#ifdef CONFIG_CPU_CORTEX_M +#include "time_arm_m.h" +#else +#include "time_generic.h" +#endif + +/* Check config for obvious mistakes. */ +#ifdef CONFIG_ASSERT +#warning "This is a performance benchmark, debug features should not normally be enabled" +#define ORDER_CHECK() printk("%s:%d\n", __func__, __LINE__) +#else +#define ORDER_CHECK() +#endif + +/* This ends up measuring a bunch of stuff not involved in arch code */ +#if defined(CONFIG_TIMESLICING) +#warning "Timeslicing can pollute the microbenchmark" +#endif + +/* And these absolutely explode the numbers, making the arch layer just noise */ +#if defined(CONFIG_MPU) || defined(CONFIG_MMU) +#error "Don't enable memory management hardware in a microbenchmark!" 
+#endif + +#define HI_PRIO 0 +#define LO_PRIO 1 +#define MAIN_PRIO 2 +#define DONE_PRIO 3 + +void thread_fn(void *t0_arg, void *t1_arg, void *c); + +int swap_count; + +/* swap enter/exit times for each thread */ +int t0_0, t0_1, t1_0, t1_1; + +uint32_t samples[8 * 1024]; + +struct k_spinlock lock; + +K_SEM_DEFINE(done_sem, 0, 999); +K_THREAD_DEFINE(thread0, 1024, thread_fn, &t0_0, &t0_1, NULL, HI_PRIO, 0, 0); +K_THREAD_DEFINE(thread1, 1024, thread_fn, &t1_0, &t1_1, NULL, HI_PRIO, 0, 0); + +void thread_fn(void *t0_arg, void *t1_arg, void *c) +{ + uint32_t t0, t1, *t0_out = t0_arg, *t1_out = t1_arg; + + while (true) { + k_spinlock_key_t k = k_spin_lock(&lock); + + k_thread_priority_set(k_current_get(), DONE_PRIO); + + ORDER_CHECK(); + t0 = time(); + z_swap(&lock, k); + t1 = time(); + + *t0_out = t0; + *t1_out = t1; + } +} + +int main(void) +{ + uint64_t total = 0, avg8, var8 = 0; + size_t n = ARRAY_SIZE(samples); + +#ifdef CONFIG_SMP + __ASSERT(arch_num_cpus() == 1, "Test requires only one CPU be active"); +#endif + + time_setup(); + k_thread_priority_set(k_current_get(), MAIN_PRIO); + + /* The threads are launched by the kernel at HI priority, so + * they've already run and are suspended in swap for us. + */ + for (int i = 0; i < n; i++) { + k_sched_lock(); + + ORDER_CHECK(); + k_thread_priority_set(thread0, HI_PRIO); + ORDER_CHECK(); + k_thread_priority_set(thread1, LO_PRIO); + ORDER_CHECK(); + + /* Now unlock: thread0 will run first, looping around + * and calling z_swap, with an entry timestamp of + * t0_0. That will swap to thread1 (which timestamps + * its exit as t1_1). Then we end up back here. + */ + ORDER_CHECK(); + k_sched_unlock(); + + /* And some complexity: thread1 "woke up" on our cycle + * and stored its exit time in t1_1. But thread0's + * entry time is still a local variable suspended on + * its stack. So we pump it once to get it to store + * its output. + */ + k_thread_priority_set(thread0, HI_PRIO); + + uint32_t dt = time_delta(t0_0, t1_1); + + samples[i] = dt; + total += dt; + } + + k_thread_abort(thread0); + k_thread_abort(thread1); + + /* Do the variance math left-shifted by 8 bits */ + avg8 = (int)(total * 256 / n); + for (int i = 0; i < n; i++) { + int d = (int)(samples[i] * 256) - avg8; + + var8 += d * d; + } + + double stdev = sqrt((var8 / 256.0) / n); + int stdev_i = (int)stdev; + int stdev_f = (int)(10 * (stdev - stdev_i)); + + printk("Swap average %d stdev %d.%d\n", (int)((avg8 + 128) / 256), stdev_i, stdev_f); + + return 0; +} diff --git a/tests/benchmarks/swap/src/time_arm_m.h b/tests/benchmarks/swap/src/time_arm_m.h new file mode 100644 index 0000000000000..8f246bc7f6201 --- /dev/null +++ b/tests/benchmarks/swap/src/time_arm_m.h @@ -0,0 +1,24 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +static ALWAYS_INLINE void time_setup(void) +{ + /* Disable SysTick interrupts so the timer driver doesn't + * interfere, we want the full 24 bit space. + */ + SysTick->CTRL &= ~SysTick_CTRL_TICKINT_Msk; + SysTick->LOAD = 0xffffff; +} + +static ALWAYS_INLINE uint32_t time(void) +{ + return SysTick->VAL; +} + +static ALWAYS_INLINE uint32_t time_delta(uint32_t t0, uint32_t t1) +{ + /* SysTick counts down, not up! 
*/ + return (t0 - t1) & 0xffffff; +} diff --git a/tests/benchmarks/swap/src/time_generic.h b/tests/benchmarks/swap/src/time_generic.h new file mode 100644 index 0000000000000..331797d731b88 --- /dev/null +++ b/tests/benchmarks/swap/src/time_generic.h @@ -0,0 +1,16 @@ +/* Copyright 2025 The ChromiumOS Authors + * SPDX-License-Identifier: Apache-2.0 + */ +static ALWAYS_INLINE void time_setup(void) +{ +} + +static ALWAYS_INLINE uint32_t time(void) +{ + return k_cycle_get_32(); +} + +static ALWAYS_INLINE uint32_t time_delta(uint32_t t0, uint32_t t1) +{ + return t1 - t0; +} From 19ba818c1debac4f4ca9dbdb72bd23bbf687df03 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 29 Jul 2025 13:14:36 -0700 Subject: [PATCH 10/27] arch/arm: Work around gcc codegen oddity in arch_switch() GCC/gas has a code generation bugglet on thumb. The R7 register is the ABI-defined frame pointer, though it's usually unused in zephyr due to -fomit-frame-pointer (and the fact the DWARF on ARM doesn't really need it). But when it IS enabled, which sometimes seems to happen due to toolchain internals, GCC is unable to allow its use in the clobber list of an asm() block (I guess it can't generate spill/fill code without using the frame?). There is existing protection for this problem that sets -fomit-frame-pointer unconditionally on the two files (sched.c and init.c) that require it. But even with that, gcc sometimes gets kicked back into "framed mode" due to internal state. Provide a kconfig workaround that does an explicit spill/fill on the one test/platform where we have trouble. (I checked, btw: an ARM clang build appears not to have this misfeature) Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/Kconfig | 9 +++++++++ include/zephyr/arch/arm/arm-m-switch.h | 11 ++++++++--- tests/benchmarks/sched_queues/testcase.yaml | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm/core/cortex_m/Kconfig b/arch/arm/core/cortex_m/Kconfig index 031bd7180f41f..b506249f40733 100644 --- a/arch/arm/core/cortex_m/Kconfig +++ b/arch/arm/core/cortex_m/Kconfig @@ -559,6 +559,15 @@ config CORTEX_M_NULL_POINTER_EXCEPTION_PAGE_SIZE endif # CORTEX_M_NULL_POINTER_EXCEPTION +config ARM_GCC_FP_WORKAROUND + bool "GCC/gas frame pointer workaround" + help + Enables a two-instruction workaround for a GNU toolchain + buglet in arch_switch() when the kernel must unavoidably be + compiled with -fno-omit-frame-pointer. Most code doesn't + want this, but in at least one case it's necessary. See + comment in arm-m-switch.h + rsource "tz/Kconfig" endif # CPU_CORTEX_M diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h index 2187856dfcca9..6e4c0c9aa5e6c 100644 --- a/include/zephyr/arch/arm/arm-m-switch.h +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -138,6 +138,7 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) register uint32_t r4 __asm__("r4") = (uint32_t)switch_to; register uint32_t r5 __asm__("r5") = (uint32_t)switched_from; __asm__ volatile( + _R7_CLOBBER_OPT("push {r7};") /* Construct and push a {r12, lr, pc} group at the top * of the frame, where PC points to the final restore location * at the end of this sequence. 
@@ -215,9 +216,13 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) "pop {pc};" "3:" /* Label for restore address */ - ::"r"(r4), - "r"(r5) - : "r6", "r7", "r8", "r9", "r10", "r11"); + _R7_CLOBBER_OPT("pop {r7};") + ::"r"(r4), "r"(r5) + : "r6", "r8", "r9", "r10", +#ifndef CONFIG_ARM_GCC_FP_WORKAROUND + "r7", +#endif + "r11"); } #ifdef CONFIG_USE_SWITCH diff --git a/tests/benchmarks/sched_queues/testcase.yaml b/tests/benchmarks/sched_queues/testcase.yaml index e60bcb1fef2d2..3813223fd77df 100644 --- a/tests/benchmarks/sched_queues/testcase.yaml +++ b/tests/benchmarks/sched_queues/testcase.yaml @@ -28,6 +28,7 @@ tests: benchmark.sched_queues.scalable: extra_configs: - CONFIG_SCHED_SCALABLE=y + - CONFIG_ARM_GCC_FP_WORKAROUND=y benchmark.sched_queues.multiq: extra_configs: From bb918abd552e9ddefdc4a868b29a024e30cabffb Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 29 Jul 2025 14:26:40 -0700 Subject: [PATCH 11/27] samples/openamp: Bump stack size Running this with USE_SWITCH=y takes more space as expected, but it oddly seems variable. Even at 2kb, I still see fairly regular failures. It wants 3kb to be stable. Maybe the non-determinism of the two-qemu setup is involved? Requests batch up and are handled recursively? Too much code in the third party stack to make it easy to tell. Signed-off-by: Andy Ross --- samples/sensor/sensor_shell/prj.conf | 2 ++ samples/subsys/ipc/openamp/remote/prj.conf | 1 + 2 files changed, 3 insertions(+) diff --git a/samples/sensor/sensor_shell/prj.conf b/samples/sensor/sensor_shell/prj.conf index d3189e39aeac7..113cf803f5c7f 100644 --- a/samples/sensor/sensor_shell/prj.conf +++ b/samples/sensor/sensor_shell/prj.conf @@ -6,3 +6,5 @@ CONFIG_SENSOR_INFO=y CONFIG_LOG=y CONFIG_RTIO_CONSUME_SEM=y + +CONFIG_MAIN_STACK_SIZE=2048 diff --git a/samples/subsys/ipc/openamp/remote/prj.conf b/samples/subsys/ipc/openamp/remote/prj.conf index ffe8ef3422975..85541ba949c82 100644 --- a/samples/subsys/ipc/openamp/remote/prj.conf +++ b/samples/subsys/ipc/openamp/remote/prj.conf @@ -6,3 +6,4 @@ CONFIG_HEAP_MEM_POOL_SIZE=4096 CONFIG_OPENAMP=y CONFIG_SEGGER_RTT_BUFFER_SIZE_UP=4096 CONFIG_OPENAMP_MASTER=n +CONFIG_MAIN_STACK_SIZE=3172 From dcb40db28cff1f6e9df0e51b146e5e239ac6a3c0 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 4 Aug 2025 09:48:39 -0700 Subject: [PATCH 12/27] tests/benchmarks/thread_metric: Turn off CONFIG_MPU explicitly This gets turned on at the platform layer by a handful of boards, despite note being used for anything in the test. It has performance impact, so really shouldn't be here. Note that ARM_MPU also needs an =n, as that will select MPU, seemingly incorrectly. 
Signed-off-by: Andy Ross --- tests/benchmarks/thread_metric/prj.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/benchmarks/thread_metric/prj.conf b/tests/benchmarks/thread_metric/prj.conf index df5648feb8704..bca6a64c16727 100644 --- a/tests/benchmarks/thread_metric/prj.conf +++ b/tests/benchmarks/thread_metric/prj.conf @@ -26,3 +26,9 @@ CONFIG_THREAD_LOCAL_STORAGE=n # Disable memory slab pointer validation CONFIG_MEM_SLAB_POINTER_VALIDATE=n + +# Turn off MPU explicitly, as some platforms enable it even when +# unused, and it impacts performance paths needlessly (on arm, ARM_MPU +# appears to select MPU, so you need to hit both) +CONFIG_MPU=n +CONFIG_ARM_MPU=n From 289653b678c234a48194fef723bac63c6f59fdb0 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 5 Aug 2025 12:12:04 -0700 Subject: [PATCH 13/27] tests/benchmarks/swap: Add interrupt cases too This microbenchmark has been super-useful. Add similar cases looking at targetting interupt latency: + "IRQ" tests the cycles taken from the instant of an interrupt trigger (as implemented, it's an arch_irq_lock() being released with a timer interrupt pending) through a noop ISR (a k_timer callback), and back to the interrupting thread. + "IRQ_P" is a similar preemption test: it measures the time taken from the same trigger, into a higher-priority thread that timestamps the instant of its wakeup (in this case as a return from k_sem_take()). Signed-off-by: Andy Ross --- tests/benchmarks/swap/src/main.c | 155 ++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 11 deletions(-) diff --git a/tests/benchmarks/swap/src/main.c b/tests/benchmarks/swap/src/main.c index 1b744cbcfaad4..7f9d21cb4e774 100644 --- a/tests/benchmarks/swap/src/main.c +++ b/tests/benchmarks/swap/src/main.c @@ -40,6 +40,14 @@ #error "Don't enable memory management hardware in a microbenchmark!" #endif +#if defined(CONFIG_FPU) +#error "Don't enable FPU/DSP in a microbenchmark!" +#endif + +#if defined(CONFIG_HW_STACK_PROTECTION) +#error "Don't enable hardware stack protection in a microbenchmark!" 
+#endif + #define HI_PRIO 0 #define LO_PRIO 1 #define MAIN_PRIO 2 @@ -60,6 +68,30 @@ K_SEM_DEFINE(done_sem, 0, 999); K_THREAD_DEFINE(thread0, 1024, thread_fn, &t0_0, &t0_1, NULL, HI_PRIO, 0, 0); K_THREAD_DEFINE(thread1, 1024, thread_fn, &t1_0, &t1_1, NULL, HI_PRIO, 0, 0); +void report(const char *name, size_t n) +{ + uint64_t total = 0; + + for (int i = 0; i < n; i++) { + total += samples[i]; + } + + float var = 0, avg = total / (float)n; + + for (int i = 0; i < n; i++) { + int d = samples[i] - avg; + + var += d * d; + } + + float stdev = sqrt(var / n); + int iavg = (int)(avg + 0.5f); + int stdev_i = (int)stdev; + int stdev_f = (int)(10 * (stdev - stdev_i)); + + printk("%s samples=%d average %d stdev %d.%d\n", name, n, iavg, stdev_i, stdev_f); +} + void thread_fn(void *t0_arg, void *t1_arg, void *c) { uint32_t t0, t1, *t0_out = t0_arg, *t1_out = t1_arg; @@ -79,9 +111,8 @@ void thread_fn(void *t0_arg, void *t1_arg, void *c) } } -int main(void) +void swap_bench(void) { - uint64_t total = 0, avg8, var8 = 0; size_t n = ARRAY_SIZE(samples); #ifdef CONFIG_SMP @@ -122,25 +153,127 @@ int main(void) uint32_t dt = time_delta(t0_0, t1_1); samples[i] = dt; - total += dt; } k_thread_abort(thread0); k_thread_abort(thread1); - /* Do the variance math left-shifted by 8 bits */ - avg8 = (int)(total * 256 / n); + report("SWAP", n); +} + +struct k_timer tm; + +K_SEM_DEFINE(hi_sem, 0, 9999); +uint32_t t_preempt; + +void hi_thread_fn(void *a, void *b, void *c) +{ + while (true) { + k_sem_take(&hi_sem, K_FOREVER); + t_preempt = k_cycle_get_32(); + } +} + +K_THREAD_DEFINE(hi_thread, 1024, hi_thread_fn, NULL, NULL, NULL, -1, 0, 0); + +void timer0_fn(struct k_timer *t) +{ + /* empty */ +} + +/* The hardware devices I have see excursions in interrupt latency I + * don't understand. It looks like more than one is queued up? Check + * that nothing looks weird and retry if it does. Doing this gives + * MUCH lower variance (like 2-3 cycle stdev and not 100+) + */ +bool retry(int i) +{ + if (i == 0) { + return false; + } + + uint32_t dt = samples[i], dt0 = samples[i - 1]; + + /* Check for >25% delta */ + if (dt > dt0 && (dt - dt0) > (dt0 / 4)) { + return true; + } + return false; +} + +void irq_bench(void) +{ + size_t n = MIN(ARRAY_SIZE(samples), 4 * CONFIG_SYS_CLOCK_TICKS_PER_SEC); + uint32_t t0, t1; + unsigned int key; + + k_timer_init(&tm, timer0_fn, NULL); + for (int i = 0; i < n; i++) { - int d = (int)(samples[i] * 256) - avg8; + /* Lock interrupts before starting the timer, then + * wait for two (!) ticks to roll over to be sure the + * interrupt is queued. + */ + key = arch_irq_lock(); + k_timer_start(&tm, K_TICKS(0), K_FOREVER); + + k_busy_wait(k_ticks_to_us_ceil32(3)); + + /* Releasing the lock will fire the interrupt synchronously. 
*/ + t0 = k_cycle_get_32(); + arch_irq_unlock(key); + t1 = k_cycle_get_32(); - var8 += d * d; + k_timer_stop(&tm); + + samples[i] = t1 - t0; + + if (retry(i)) { + i--; + } } + report("IRQ", n); +} - double stdev = sqrt((var8 / 256.0) / n); - int stdev_i = (int)stdev; - int stdev_f = (int)(10 * (stdev - stdev_i)); +/* Very similar test, but switches to hi_thread and checks time on + * interrupt exit there, to measure preemption overhead + */ +void irq_p_bench(void) +{ + size_t n = MIN(ARRAY_SIZE(samples), 4 * CONFIG_SYS_CLOCK_TICKS_PER_SEC); + uint32_t t0; + unsigned int key; + + k_timer_init(&tm, timer0_fn, NULL); + + for (int i = 0; i < n; i++) { + key = arch_irq_lock(); + k_timer_start(&tm, K_TICKS(0), K_FOREVER); + + k_busy_wait(k_ticks_to_us_ceil32(3)); + + k_sem_give(&hi_sem); + t0 = k_cycle_get_32(); + arch_irq_unlock(key); + + k_timer_stop(&tm); + + samples[i] = t_preempt - t0; + + if (retry(i)) { + i--; + } + } + report("IRQ_P", n); +} + +int main(void) +{ + irq_bench(); + irq_p_bench(); - printk("Swap average %d stdev %d.%d\n", (int)((avg8 + 128) / 256), stdev_i, stdev_f); + /* This disables the SysTick interrupt and must be last! */ + swap_bench(); return 0; } From 580faf3d130143a2cb5e6d9e511756ee74ab4a50 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 5 Aug 2025 09:29:23 -0700 Subject: [PATCH 14/27] tests/benchmarks/thread_metric: Don't select IRQ_OFFLOAD_NESTED This is a mistake. IRQ_OFFLOAD_NESTED is not intended to be an app tunable. It reflects whether or not platform layer irq_offload() implementation supports its use from within an ISR (i.e. to invoke a nested interrupt for testing purposes). Not all architectures can do that (and I know of at least one Xtensa soc that didn't give a software interrupt the right priority and can't either). Basically this should have been a "ARCH_HAS_" kind of kconfig, it's just named funny. And in any case the thread_metric tests don't do nested interrupts, which as I understand it aren't even possible in the FreeRTOS environment it was written on. Signed-off-by: Andy Ross --- tests/benchmarks/thread_metric/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/benchmarks/thread_metric/Kconfig b/tests/benchmarks/thread_metric/Kconfig index 72ebc3042dfbf..9ea39489461d2 100644 --- a/tests/benchmarks/thread_metric/Kconfig +++ b/tests/benchmarks/thread_metric/Kconfig @@ -30,7 +30,6 @@ config TM_INTERRUPT bool "Interrupt processing" select TEST select IRQ_OFFLOAD - select IRQ_OFFLOAD_NESTED help The interrupt processing benchmark has a single thread that causes an interrupt which results in its ISR incrementing a counter and then @@ -42,7 +41,6 @@ config TM_INTERRUPT_PREEMPTION bool "Interrupt processing preemption" select TEST select IRQ_OFFLOAD - select IRQ_OFFLOAD_NESTED help The interrupt preemption benchmark counts the number of times that an ISR from a software generated interrupt results in the preemption From 0c3f9e4b4c71c91a37a5fee6ec7585704fdac17c Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Tue, 5 Aug 2025 14:26:10 -0700 Subject: [PATCH 15/27] kernel/sched: Add optimized next switch handle wrapper z_get_next_switch_handle() is a clean API, but implementing it as a (comparatively large) callable function requires significant entry/exit boilerplate and hides the very common "no switch needed" early exit condition from the enclosing C code that calls it. (Most architectures call this from assembly though and don't notice). Provide an unwrapped version for the specific needs non-SMP builds. 
It's compatible in all other ways. Slightly ugly, but the gains are significant (like a dozen cycles or so). Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 2 +- kernel/include/ksched.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index 778416b032a85..c99d9b0e84076 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -464,7 +464,7 @@ bool arm_m_must_switch(uint32_t lr) */ unsigned int key = arch_irq_lock(); - void *last, *next = z_get_next_switch_handle(NULL); + void *last, *next = z_sched_next_handle(last_thread); if (next == NULL) { arch_irq_unlock(key); diff --git a/kernel/include/ksched.h b/kernel/include/ksched.h index 171e25cc37beb..fa11f9a1e4768 100644 --- a/kernel/include/ksched.h +++ b/kernel/include/ksched.h @@ -63,6 +63,26 @@ int z_unpend_all(_wait_q_t *wait_q); bool z_thread_prio_set(struct k_thread *thread, int prio); void *z_get_next_switch_handle(void *interrupted); +/* Wrapper around z_get_next_switch_handle() for the benefit of + * non-SMP platforms that always pass a NULL interrupted handle. + * Exposes the (extremely) common early exit case in a context that + * can be (much) better optimized in surrounding C code. Takes a + * _current pointer that the outer scope surely already has to avoid + * having to refetch it across a lock. + * + * Mystic arts for cycle nerds, basically. Replace with + * z_get_next_switch_handle() if this becomes a maintenance hassle. + */ +static ALWAYS_INLINE void *z_sched_next_handle(struct k_thread *curr) +{ +#ifndef CONFIG_SMP + if (curr == _kernel.ready_q.cache) { + return NULL; + } +#endif + return z_get_next_switch_handle(NULL); +} + void z_time_slice(void); void z_reset_time_slice(struct k_thread *curr); void z_sched_ipi(void); From 9be52b7da715d78ce18a4dd53d9a0f0e239b53c0 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 15:24:12 -0700 Subject: [PATCH 16/27] arch/arm: Use an open coded lock and not arch_irq_lock() Micro-optimization: We don't need a full arch_irq_lock(), which is a ~6-instruction sequence on Cortex M. The lock will be dropped unconditionally on interrupt exit, so take it unconditionally. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 28 +++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index c99d9b0e84076..6f798cb2b1d01 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -449,25 +449,29 @@ void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *ar bool arm_m_must_switch(uint32_t lr) { - struct k_thread *last_thread = NULL; - - if (IS_ENABLED(CONFIG_MULTITHREADING)) { - last_thread = _current; - } - - if (!is_thread_return(lr)) { + /* Secure mode transistions can push a non-thread frame to the + * stack. If not enabled, we already know by construction + * that we're handling the bottom level of the interrupt stack + * and returning to thread mode. + */ + if ((IS_ENABLED(CONFIG_ARM_SECURE_FIRMWARE) || + IS_ENABLED(CONFIG_ARM_NONSECURE_FIRMWARE)) + && !is_thread_return(lr)) { return false; } - /* This lock is held until the end of the context switch (see - * arch_switch and arm_m_exc_exit) + /* This lock is held until the end of the context switch, at + * which point it will be dropped unconditionally. 
Save a few + * cycles by skipping the needless bits of arch_irq_lock(). */ - unsigned int key = arch_irq_lock(); + uint32_t pri = _EXC_IRQ_DEFAULT_PRIO; + __asm__ volatile("msr basepri, %0" :: "r"(pri)); + + struct k_thread *last_thread = last_thread = _current; void *last, *next = z_sched_next_handle(last_thread); if (next == NULL) { - arch_irq_unlock(key); return false; } @@ -546,8 +550,8 @@ __asm__(".globl arm_m_exc_exit;" " stm r0, {r4-r11};" /* out is a switch_frame */ " ldm r1!, {r7-r11};" /* in is a synth_frame */ " ldm r1, {r4-r6};" + "1:\n" " mov r1, #0;" " msr basepri, r1;" /* release lock taken in must_switch */ - "1:\n" " bx lr;"); #endif From b0eea29a95a79ddf7d40d085f78d7e079009aec4 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 15:09:41 -0700 Subject: [PATCH 17/27] arch/arm: Split Cortex M PendSV vector from SVC The new switch code no longer needs PendSV, but still calls the SVC vector. Split them into separate files for hygiene and a few microseconds of build time. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/CMakeLists.txt | 4 +- arch/arm/core/cortex_m/svc.S | 266 ++++++++++++++++++++++++++ arch/arm/core/cortex_m/swap_helper.S | 246 ------------------------ 3 files changed, 268 insertions(+), 248 deletions(-) create mode 100644 arch/arm/core/cortex_m/svc.S diff --git a/arch/arm/core/cortex_m/CMakeLists.txt b/arch/arm/core/cortex_m/CMakeLists.txt index 0e5aa94676c26..f9435ebcf9394 100644 --- a/arch/arm/core/cortex_m/CMakeLists.txt +++ b/arch/arm/core/cortex_m/CMakeLists.txt @@ -26,7 +26,7 @@ zephyr_library_sources( reset.S scb.c vector_table.S - swap_helper.S + svc.S irq_manage.c prep_c.c thread.c @@ -34,7 +34,7 @@ zephyr_library_sources( ) zephyr_library_sources_ifdef(CONFIG_USE_SWITCH arm-m-switch.c) -zephyr_library_sources_ifndef(CONFIG_USE_SWITCH exc_exit.c thread_abort.c) +zephyr_library_sources_ifndef(CONFIG_USE_SWITCH exc_exit.c thread_abort.c swap_helper.S) zephyr_library_sources_ifndef(CONFIG_ARM_CUSTOM_INTERRUPT_CONTROLLER irq_init.c) zephyr_library_sources_ifdef(CONFIG_GEN_SW_ISR_TABLE isr_wrapper.c) diff --git a/arch/arm/core/cortex_m/svc.S b/arch/arm/core/cortex_m/svc.S new file mode 100644 index 0000000000000..34b4e1afd7ddc --- /dev/null +++ b/arch/arm/core/cortex_m/svc.S @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2013-2014 Wind River Systems, Inc. + * Copyright (c) 2017-2019 Nordic Semiconductor ASA. + * Copyright (c) 2020 Stephanos Ioannidis + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @file + * @brief SVC exception handling for ARM Cortex-M + * + * This module implements the z_arm_svc exception vector on ARM Cortex-M CPUs. + */ + +#include +#include +#include +#include + +_ASM_FILE_PROLOGUE + +GTEXT(z_arm_svc) +GTEXT(z_do_kernel_oops) +#if defined(CONFIG_USERSPACE) +GTEXT(z_arm_do_syscall) +#endif + +GDATA(_kernel) + +/** + * + * @brief Service call handler + * + * The service call (svc) is used in the following occasions: + * - IRQ offloading + * - Kernel run-time exceptions + * - System Calls (User mode) + * + */ +SECTION_FUNC(TEXT, z_arm_svc) + /* Use EXC_RETURN state to find out if stack frame is on the + * MSP or PSP + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r0, #_EXC_RETURN_SPSEL_Msk + mov r1, lr + tst r1, r0 + beq .L_stack_frame_msp + mrs r0, PSP + bne .L_stack_frame_endif +.L_stack_frame_msp: + mrs r0, MSP +.L_stack_frame_endif: +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + tst lr, #_EXC_RETURN_SPSEL_Msk /* did we come from thread mode ? 
*/ + ite eq /* if zero (equal), came from handler mode */ + mrseq r0, MSP /* handler mode, stack frame is on MSP */ + mrsne r0, PSP /* thread mode, stack frame is on PSP */ +#endif + + + /* Figure out what SVC call number was invoked */ + + ldr r1, [r0, #24] /* grab address of PC from stack frame */ + /* SVC is a two-byte instruction, point to it and read the + * SVC number (lower byte of SCV instruction) + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + subs r1, r1, #2 + ldrb r1, [r1] +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldrb r1, [r1, #-2] +#endif + + /* + * grab service call number: + * 0: Unused + * 1: irq_offload (if configured) + * 2: kernel panic or oops (software generated fatal exception) + * 3: System call (if user mode supported) + */ +#if defined(CONFIG_USERSPACE) + mrs r2, CONTROL + + cmp r1, #3 + beq .L_do_syscall + + /* + * check that we are privileged before invoking other SVCs + * oops if we are unprivileged + */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r3, #0x1 + tst r2, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + tst r2, #0x1 +#endif + bne .L_oops + +#endif /* CONFIG_USERSPACE */ + + cmp r1, #2 + beq .L_oops + +#if defined(CONFIG_IRQ_OFFLOAD) + push {r0, lr} + bl z_irq_do_offload /* call C routine which executes the offload */ +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + pop {r0, r3} + mov lr, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + pop {r0, lr} +#endif + +#ifdef CONFIG_USE_SWITCH + /* FIXME: this uses the pre-hook LR, but we want to refetch the + * one in the stack frame, otherwise irq_offload() won't context + * switch correctly (which... isn't tested?) + */ + push {r0, lr} + bl arm_m_legacy_exit + pop {r0, lr} + bx lr +#else + ldr r0, =z_arm_int_exit + bx r0 +#endif + +#endif + +.L_oops: + push {r0, lr} +#if defined(CONFIG_EXTRA_EXCEPTION_INFO) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* Build _callee_saved_t. To match the struct + * definition we push the psp & then r11-r4 + */ + mrs r1, PSP + push {r1, r2} + push {r4-r11} + mov r1, sp /* pointer to _callee_saved_t */ +#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ +#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ + mov r2, lr /* EXC_RETURN */ + bl z_do_kernel_oops + /* return from SVC exception is done here */ +#if defined(CONFIG_EXTRA_EXCEPTION_INFO) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* We do not need to restore any register state here + * because we did not use any callee-saved registers + * in this routine. Therefore, we can just reset + * the MSP to its value prior to entering the function + */ + add sp, #40 +#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ +#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ + +#ifdef CONFIG_USE_SWITCH + bl arm_m_legacy_exit +#endif + + pop {r0, pc} + +#if defined(CONFIG_USERSPACE) + /* + * System call will setup a jump to the z_arm_do_syscall() function + * when the SVC returns via the bx lr. + * + * There is some trickery involved here because we have to preserve + * the original PC value so that we can return back to the caller of + * the SVC. 
+ * + * On SVC exeption, the stack looks like the following: + * r0 - r1 - r2 - r3 - r12 - LR - PC - PSR + * + * Registers look like: + * r0 - arg1 + * r1 - arg2 + * r2 - arg3 + * r3 - arg4 + * r4 - arg5 + * r5 - arg6 + * r6 - call_id + * r8 - saved link register + */ +.L_do_syscall: +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + movs r3, #24 + ldr r1, [r0, r3] /* grab address of PC from stack frame */ + mov r8, r1 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldr r8, [r0, #24] /* grab address of PC from stack frame */ +#endif + ldr r1, =z_arm_do_syscall +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + str r1, [r0, r3] /* overwrite the PC to point to z_arm_do_syscall */ +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + str r1, [r0, #24] /* overwrite the PC to point to z_arm_do_syscall */ +#endif + +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + ldr r3, =K_SYSCALL_LIMIT + cmp r6, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + /* validate syscall limit */ + ldr ip, =K_SYSCALL_LIMIT + cmp r6, ip +#endif + /* The supplied syscall_id must be lower than the limit + * (Requires unsigned integer comparison) + */ + blo .L_valid_syscall_id + + /* bad syscall id. Set arg1 to bad id and set call_id to SYSCALL_BAD */ + str r6, [r0] + ldr r6, =K_SYSCALL_BAD + + /* Bad syscalls treated as valid syscalls with ID K_SYSCALL_BAD. */ + +.L_valid_syscall_id: + ldr r0, =_kernel + ldr r0, [r0, #_kernel_offset_to_current] +#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) + mov ip, r2 + ldr r1, =_thread_offset_to_mode + ldr r3, [r0, r1] + movs r2, #1 + bics r3, r2 + /* Store (privileged) mode in thread's mode state variable */ + str r3, [r0, r1] + mov r2, ip + dsb + /* set mode to privileged, r2 still contains value from CONTROL */ + movs r3, #1 + bics r2, r3 +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + ldr r1, [r0, #_thread_offset_to_mode] + bic r1, #1 + /* Store (privileged) mode in thread's mode state variable */ + str r1, [r0, #_thread_offset_to_mode] + dsb + /* set mode to privileged, r2 still contains value from CONTROL */ + bic r2, #1 +#endif + msr CONTROL, r2 + + /* ISB is not strictly necessary here (stack pointer is not being + * touched), but it's recommended to avoid executing pre-fetched + * instructions with the previous privilege. + */ + isb + +#if defined(CONFIG_BUILTIN_STACK_GUARD) + /* Thread is now in privileged mode; after returning from SCVall it + * will use the default (user) stack before switching to the privileged + * stack to execute the system call. We need to protect the user stack + * against stack overflows until this stack transition. 
+ */ + ldr r1, [r0, #_thread_offset_to_stack_info_start] /* stack_info.start */ + msr PSPLIM, r1 +#endif /* CONFIG_BUILTIN_STACK_GUARD */ + + /* return from SVC to the modified LR - z_arm_do_syscall */ + bx lr +#endif /* CONFIG_USERSPACE */ diff --git a/arch/arm/core/cortex_m/swap_helper.S b/arch/arm/core/cortex_m/swap_helper.S index 0dc4692bc1c64..0aa729a91c336 100644 --- a/arch/arm/core/cortex_m/swap_helper.S +++ b/arch/arm/core/cortex_m/swap_helper.S @@ -15,21 +15,12 @@ */ #include -#include #include #include -#include -#include -#include _ASM_FILE_PROLOGUE -GTEXT(z_arm_svc) GTEXT(z_arm_pendsv) -GTEXT(z_do_kernel_oops) -#if defined(CONFIG_USERSPACE) -GTEXT(z_arm_do_syscall) -#endif GDATA(_kernel) @@ -346,240 +337,3 @@ SECTION_FUNC(TEXT, z_arm_pendsv) * Cortex-M: return from PendSV exception */ bx lr - -/** - * - * @brief Service call handler - * - * The service call (svc) is used in the following occasions: - * - IRQ offloading - * - Kernel run-time exceptions - * - System Calls (User mode) - * - */ -SECTION_FUNC(TEXT, z_arm_svc) - /* Use EXC_RETURN state to find out if stack frame is on the - * MSP or PSP - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r0, #_EXC_RETURN_SPSEL_Msk - mov r1, lr - tst r1, r0 - beq .L_stack_frame_msp - mrs r0, PSP - bne .L_stack_frame_endif -.L_stack_frame_msp: - mrs r0, MSP -.L_stack_frame_endif: -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - tst lr, #_EXC_RETURN_SPSEL_Msk /* did we come from thread mode ? */ - ite eq /* if zero (equal), came from handler mode */ - mrseq r0, MSP /* handler mode, stack frame is on MSP */ - mrsne r0, PSP /* thread mode, stack frame is on PSP */ -#endif - - - /* Figure out what SVC call number was invoked */ - - ldr r1, [r0, #24] /* grab address of PC from stack frame */ - /* SVC is a two-byte instruction, point to it and read the - * SVC number (lower byte of SCV instruction) - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - subs r1, r1, #2 - ldrb r1, [r1] -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldrb r1, [r1, #-2] -#endif - - /* - * grab service call number: - * 0: Unused - * 1: irq_offload (if configured) - * 2: kernel panic or oops (software generated fatal exception) - * 3: System call (if user mode supported) - */ -#if defined(CONFIG_USERSPACE) - mrs r2, CONTROL - - cmp r1, #3 - beq .L_do_syscall - - /* - * check that we are privileged before invoking other SVCs - * oops if we are unprivileged - */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r3, #0x1 - tst r2, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - tst r2, #0x1 -#endif - bne .L_oops - -#endif /* CONFIG_USERSPACE */ - - cmp r1, #2 - beq .L_oops - -#if defined(CONFIG_IRQ_OFFLOAD) - push {r0, lr} - bl z_irq_do_offload /* call C routine which executes the offload */ -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - pop {r0, r3} - mov lr, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - pop {r0, lr} -#endif - -#ifdef CONFIG_USE_SWITCH - /* FIXME: this uses the pre-hook LR, but we want to refetch the - * one in the stack frame, otherwise irq_offload() won't context - * switch correctly (which... isn't tested?) - */ - push {r0, lr} - bl arm_m_legacy_exit - pop {r0, lr} - bx lr -#else - ldr r0, =z_arm_int_exit - bx r0 -#endif - -#endif - -.L_oops: - push {r0, lr} -#if defined(CONFIG_EXTRA_EXCEPTION_INFO) -#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* Build _callee_saved_t. 
To match the struct - * definition we push the psp & then r11-r4 - */ - mrs r1, PSP - push {r1, r2} - push {r4-r11} - mov r1, sp /* pointer to _callee_saved_t */ -#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ -#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ - mov r2, lr /* EXC_RETURN */ - bl z_do_kernel_oops - /* return from SVC exception is done here */ -#if defined(CONFIG_EXTRA_EXCEPTION_INFO) -#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* We do not need to restore any register state here - * because we did not use any callee-saved registers - * in this routine. Therefore, we can just reset - * the MSP to its value prior to entering the function - */ - add sp, #40 -#endif /* CONFIG_ARMV7_M_ARMV8_M_MAINLINE */ -#endif /* CONFIG_EXTRA_EXCEPTION_INFO */ - -#ifdef CONFIG_USE_SWITCH - bl arm_m_legacy_exit -#endif - - pop {r0, pc} - -#if defined(CONFIG_USERSPACE) - /* - * System call will setup a jump to the z_arm_do_syscall() function - * when the SVC returns via the bx lr. - * - * There is some trickery involved here because we have to preserve - * the original PC value so that we can return back to the caller of - * the SVC. - * - * On SVC exeption, the stack looks like the following: - * r0 - r1 - r2 - r3 - r12 - LR - PC - PSR - * - * Registers look like: - * r0 - arg1 - * r1 - arg2 - * r2 - arg3 - * r3 - arg4 - * r4 - arg5 - * r5 - arg6 - * r6 - call_id - * r8 - saved link register - */ -.L_do_syscall: -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - movs r3, #24 - ldr r1, [r0, r3] /* grab address of PC from stack frame */ - mov r8, r1 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldr r8, [r0, #24] /* grab address of PC from stack frame */ -#endif - ldr r1, =z_arm_do_syscall -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - str r1, [r0, r3] /* overwrite the PC to point to z_arm_do_syscall */ -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - str r1, [r0, #24] /* overwrite the PC to point to z_arm_do_syscall */ -#endif - -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - ldr r3, =K_SYSCALL_LIMIT - cmp r6, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - /* validate syscall limit */ - ldr ip, =K_SYSCALL_LIMIT - cmp r6, ip -#endif - /* The supplied syscall_id must be lower than the limit - * (Requires unsigned integer comparison) - */ - blo .L_valid_syscall_id - - /* bad syscall id. Set arg1 to bad id and set call_id to SYSCALL_BAD */ - str r6, [r0] - ldr r6, =K_SYSCALL_BAD - - /* Bad syscalls treated as valid syscalls with ID K_SYSCALL_BAD. */ - -.L_valid_syscall_id: - ldr r0, =_kernel - ldr r0, [r0, #_kernel_offset_to_current] -#if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) - mov ip, r2 - ldr r1, =_thread_offset_to_mode - ldr r3, [r0, r1] - movs r2, #1 - bics r3, r2 - /* Store (privileged) mode in thread's mode state variable */ - str r3, [r0, r1] - mov r2, ip - dsb - /* set mode to privileged, r2 still contains value from CONTROL */ - movs r3, #1 - bics r2, r3 -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - ldr r1, [r0, #_thread_offset_to_mode] - bic r1, #1 - /* Store (privileged) mode in thread's mode state variable */ - str r1, [r0, #_thread_offset_to_mode] - dsb - /* set mode to privileged, r2 still contains value from CONTROL */ - bic r2, #1 -#endif - msr CONTROL, r2 - - /* ISB is not strictly necessary here (stack pointer is not being - * touched), but it's recommended to avoid executing pre-fetched - * instructions with the previous privilege. 
- */ - isb - -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Thread is now in privileged mode; after returning from SCVall it - * will use the default (user) stack before switching to the privileged - * stack to execute the system call. We need to protect the user stack - * against stack overflows until this stack transition. - */ - ldr r1, [r0, #_thread_offset_to_stack_info_start] /* stack_info.start */ - msr PSPLIM, r1 -#endif /* CONFIG_BUILTIN_STACK_GUARD */ - - /* return from SVC to the modified LR - z_arm_do_syscall */ - bx lr -#endif /* CONFIG_USERSPACE */ From 15e71884a552760fc1bf52c3dd0fd3e281cac129 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 15:10:23 -0700 Subject: [PATCH 18/27] arch/arm: Remove unused Cortex M thread struct elements When USE_SWITCH=y, the thread struct is now mostly degenerate. Only the two words for ICI/IT state tracking are required. Eliminate all the extra fields when not needed and save a bunch of SRAM. Note a handful of spots in coredump/debug that need a location for the new stack pointer (stored as the switch handle now) are also updated. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/coredump.c | 9 ++++++++- arch/arm/core/cortex_m/thread.c | 2 ++ arch/arm/core/cortex_m/vector_table.S | 2 +- arch/arm/core/offsets/offsets_aarch32.c | 2 ++ arch/arm/include/cortex_m/kernel_arch_func.h | 2 ++ include/zephyr/arch/arm/thread.h | 15 ++++++++++++++- subsys/debug/thread_info.c | 2 ++ 7 files changed, 31 insertions(+), 3 deletions(-) diff --git a/arch/arm/core/cortex_m/coredump.c b/arch/arm/core/cortex_m/coredump.c index 787cfcdf4a751..34ba08f60df9e 100644 --- a/arch/arm/core/cortex_m/coredump.c +++ b/arch/arm/core/cortex_m/coredump.c @@ -100,5 +100,12 @@ uint16_t arch_coredump_tgt_code_get(void) uintptr_t arch_coredump_stack_ptr_get(const struct k_thread *thread) { - return (thread == _current) ? 
z_arm_coredump_fault_sp : thread->callee_saved.psp; + if (thread == _current) { + return z_arm_coredump_fault_sp; + } +#ifdef CONFIG_USE_SWITCH + return (uintptr_t)thread->switch_handle; +#else + return thread->callee_saved.psp; +#endif } diff --git a/arch/arm/core/cortex_m/thread.c b/arch/arm/core/cortex_m/thread.c index 25606a6b919eb..34e5f827c4f09 100644 --- a/arch/arm/core/cortex_m/thread.c +++ b/arch/arm/core/cortex_m/thread.c @@ -112,7 +112,9 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, char *sta thread->callee_saved.psp = (uint32_t)iframe; #endif +#ifndef CONFIG_USE_SWITCH thread->arch.basepri = 0; +#endif #if defined(CONFIG_ARM_STORE_EXC_RETURN) || defined(CONFIG_USERSPACE) thread->arch.mode = 0; diff --git a/arch/arm/core/cortex_m/vector_table.S b/arch/arm/core/cortex_m/vector_table.S index 226cd636caf5b..5a43f977ab6af 100644 --- a/arch/arm/core/cortex_m/vector_table.S +++ b/arch/arm/core/cortex_m/vector_table.S @@ -74,7 +74,7 @@ SECTION_SUBSEC_FUNC(exc_vector_table,_vector_table_section,_vector_table) #error Unknown ARM architecture #endif /* CONFIG_ARMV6_M_ARMV8_M_BASELINE */ .word 0 -#if defined(CONFIG_MULTITHREADING) && !defined(USE_SWITCH) +#if defined(CONFIG_MULTITHREADING) && !defined(CONFIG_USE_SWITCH) .word z_arm_pendsv #else .word z_arm_exc_spurious diff --git a/arch/arm/core/offsets/offsets_aarch32.c b/arch/arm/core/offsets/offsets_aarch32.c index 693546630b05e..5147ced51eb2c 100644 --- a/arch/arm/core/offsets/offsets_aarch32.c +++ b/arch/arm/core/offsets/offsets_aarch32.c @@ -29,8 +29,10 @@ #include #include +#if !(defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH)) GEN_OFFSET_SYM(_thread_arch_t, basepri); GEN_OFFSET_SYM(_thread_arch_t, swap_return_value); +#endif #if defined(CONFIG_CPU_AARCH32_CORTEX_A) || defined(CONFIG_CPU_AARCH32_CORTEX_R) GEN_OFFSET_SYM(_thread_arch_t, exception_depth); diff --git a/arch/arm/include/cortex_m/kernel_arch_func.h b/arch/arm/include/cortex_m/kernel_arch_func.h index befc8ae20071c..f3e2d35b98c74 100644 --- a/arch/arm/include/cortex_m/kernel_arch_func.h +++ b/arch/arm/include/cortex_m/kernel_arch_func.h @@ -83,6 +83,7 @@ extern FUNC_NORETURN void z_arm_userspace_enter(k_thread_entry_t user_entry, voi extern void z_arm_fatal_error(unsigned int reason, const struct arch_esf *esf); +#ifndef CONFIG_USE_SWITCH static ALWAYS_INLINE int arch_swap(unsigned int key) { /* store off key and return value */ @@ -100,6 +101,7 @@ static ALWAYS_INLINE int arch_swap(unsigned int key) */ return _current->arch.swap_return_value; } +#endif #endif /* _ASMLANGUAGE */ diff --git a/include/zephyr/arch/arm/thread.h b/include/zephyr/arch/arm/thread.h index 3e73634d7b3fe..075a00f623e06 100644 --- a/include/zephyr/arch/arm/thread.h +++ b/include/zephyr/arch/arm/thread.h @@ -22,7 +22,15 @@ #ifndef _ASMLANGUAGE #include +/* Cortex M's USE_SWITCH implementation is somewhat unique and doesn't + * use much of the thread struct + */ +#if defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) +#define _ARM_M_SWITCH +#endif + struct _callee_saved { +#ifndef _ARM_M_SWITCH uint32_t v1; /* r4 */ uint32_t v2; /* r5 */ uint32_t v3; /* r6 */ @@ -35,12 +43,14 @@ struct _callee_saved { #ifdef CONFIG_USE_SWITCH uint32_t lr; /* lr */ #endif +#endif /* !_ARM_M_SWITCH */ }; typedef struct _callee_saved _callee_saved_t; #if defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) struct _preempt_float { +#ifndef _ARM_M_SWITCH float s16; float s17; float s18; @@ -57,18 +67,21 @@ struct _preempt_float { float s29; float s30; float s31; +#endif /* 
!_ARM_M_SWITCH */ }; #endif struct _thread_arch { +#ifndef _ARM_M_SWITCH /* interrupt locking key */ uint32_t basepri; /* r0 in stack frame cannot be written to reliably */ uint32_t swap_return_value; +#endif -#if defined(CONFIG_USE_SWITCH) && defined(CONFIG_CPU_CORTEX_M) +#ifdef _ARM_M_SWITCH uint32_t iciit_pc; uint32_t iciit_apsr; #endif diff --git a/subsys/debug/thread_info.c b/subsys/debug/thread_info.c index bcd58211e3bb8..ac5453ced8c40 100644 --- a/subsys/debug/thread_info.c +++ b/subsys/debug/thread_info.c @@ -57,6 +57,8 @@ const size_t _kernel_thread_info_offsets[] = { /* We are assuming that the SP of interest is SP_EL1 */ [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, callee_saved.sp_elx), +#elif defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_USE_SWITCH) + [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, switch_handle), #elif defined(CONFIG_ARM) [THREAD_INFO_OFFSET_T_STACK_PTR] = offsetof(struct k_thread, callee_saved.psp), From 673c221f519b4203c594f591315c9a8b6004920f Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 11:59:45 -0700 Subject: [PATCH 19/27] kernel/sched: Refactor reschedule to permit better code generation z_reschedule() is the basic kernel entry point for context switch, wrapping z_swap(), and thence arch_switch(). It's currently defined as a first class function for entry from other files in the kernel and elsewhere (e.g. IPC library code). But in practice it's actually a very thin wrapper without a lot of logic of its own, and the context switch layers of some of the more obnoxiously clever architectures are designed to interoperate with the compiler's own spill/fill logic to avoid double saving. And with a small z_reschedule() there's not a lot to work with. Make reschedule() an inlinable static, so the compiler has more options. Signed-off-by: Andy Ross --- kernel/sched.c | 77 +++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 9a50452cf1f1a..c2ce034d1cf8d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -492,6 +492,43 @@ static inline void z_vrfy_k_thread_suspend(k_tid_t thread) #include #endif /* CONFIG_USERSPACE */ +static inline bool resched(uint32_t key) +{ +#ifdef CONFIG_SMP + _current_cpu->swap_ok = 0; +#endif /* CONFIG_SMP */ + + return arch_irq_unlocked(key) && !arch_is_in_isr(); +} + +/* + * Check if the next ready thread is the same as the current thread + * and save the trip if true. 
+ */ +static inline bool need_swap(void) +{ + /* the SMP case will be handled in C based z_swap() */ +#ifdef CONFIG_SMP + return true; +#else + struct k_thread *new_thread; + + /* Check if the next ready thread is the same as the current thread */ + new_thread = _kernel.ready_q.cache; + return new_thread != _current; +#endif /* CONFIG_SMP */ +} + +static void reschedule(struct k_spinlock *lock, k_spinlock_key_t key) +{ + if (resched(key.key) && need_swap()) { + z_swap(lock, key); + } else { + k_spin_unlock(lock, key); + signal_pending_ipi(); + } +} + void z_impl_k_thread_resume(k_tid_t thread) { SYS_PORT_TRACING_OBJ_FUNC_ENTER(k_thread, resume, thread); @@ -507,7 +544,7 @@ void z_impl_k_thread_resume(k_tid_t thread) z_mark_thread_as_not_suspended(thread); ready_thread(thread); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); SYS_PORT_TRACING_OBJ_FUNC_EXIT(k_thread, resume, thread); } @@ -719,41 +756,9 @@ bool z_thread_prio_set(struct k_thread *thread, int prio) return need_sched; } -static inline bool resched(uint32_t key) -{ -#ifdef CONFIG_SMP - _current_cpu->swap_ok = 0; -#endif /* CONFIG_SMP */ - - return arch_irq_unlocked(key) && !arch_is_in_isr(); -} - -/* - * Check if the next ready thread is the same as the current thread - * and save the trip if true. - */ -static inline bool need_swap(void) -{ - /* the SMP case will be handled in C based z_swap() */ -#ifdef CONFIG_SMP - return true; -#else - struct k_thread *new_thread; - - /* Check if the next ready thread is the same as the current thread */ - new_thread = _kernel.ready_q.cache; - return new_thread != _current; -#endif /* CONFIG_SMP */ -} - void z_reschedule(struct k_spinlock *lock, k_spinlock_key_t key) { - if (resched(key.key) && need_swap()) { - z_swap(lock, key); - } else { - k_spin_unlock(lock, key); - signal_pending_ipi(); - } + reschedule(lock, key); } void z_reschedule_irqlock(uint32_t key) @@ -1044,7 +1049,7 @@ void z_impl_k_reschedule(void) update_cache(0); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); } #ifdef CONFIG_USERSPACE @@ -1185,7 +1190,7 @@ void z_impl_k_wakeup(k_tid_t thread) z_abort_thread_timeout(thread); z_mark_thread_as_not_sleeping(thread); ready_thread(thread); - z_reschedule(&_sched_spinlock, key); + reschedule(&_sched_spinlock, key); } else { k_spin_unlock(&_sched_spinlock, key); } From ef085e6e1983a69174fa973d8020595af19b14c6 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 12:14:16 -0700 Subject: [PATCH 20/27] kernel/sched: Move reschedule under lock in k_sched_unlock() This function was a little clumsy, taking the scheduler lock, releasing it, and then calling z_reschedule_unlocked() instead of the normal locked variant of reschedule. Don't take the lock twice. Mostly this is a code size and hygiene win. Obviously the sched lock is not normally a performance path, but I happened to have picked this API for my own microbenchmark in tests/benchmarks/swap and so noticed the double-lock while staring at disassembly. 
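A toy, host-buildable sketch of the shape change (stand-in names, not the kernel API): the old flow acquired and released the scheduler lock for the bookkeeping and then had z_reschedule_unlocked() acquire it again, while the new flow performs one acquisition and hands the held key straight to reschedule(), which drops it.

  #include <stdio.h>

  /* Stand-in lock primitives that just count acquire/release operations */
  static int lock_ops;

  static int lock(void)
  {
  	lock_ops++;
  	return 0;
  }

  static void unlock(int key)
  {
  	(void)key;
  	lock_ops++;
  }

  /* New shape: reschedule() consumes the lock the caller already holds */
  static void reschedule(int key)
  {
  	unlock(key);
  }

  /* Old shape: the unlocked variant takes the lock all over again */
  static void reschedule_unlocked(void)
  {
  	unlock(lock());
  }

  int main(void)
  {
  	int key;

  	lock_ops = 0;
  	key = lock();
  	unlock(key);
  	reschedule_unlocked();
  	printf("old k_sched_unlock shape: %d lock operations\n", lock_ops);

  	lock_ops = 0;
  	key = lock();
  	reschedule(key);
  	printf("new k_sched_unlock shape: %d lock operations\n", lock_ops);

  	return 0;
  }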
Signed-off-by: Andy Ross --- kernel/sched.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c2ce034d1cf8d..066d5f0e4503e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -787,20 +787,18 @@ void k_sched_lock(void) void k_sched_unlock(void) { - K_SPINLOCK(&_sched_spinlock) { - __ASSERT(_current->base.sched_locked != 0U, ""); - __ASSERT(!arch_is_in_isr(), ""); - - ++_current->base.sched_locked; - update_cache(0); - } - LOG_DBG("scheduler unlocked (%p:%d)", _current, _current->base.sched_locked); SYS_PORT_TRACING_FUNC(k_thread, sched_unlock); - z_reschedule_unlocked(); + k_spinlock_key_t key = k_spin_lock(&_sched_spinlock); + + __ASSERT(_current->base.sched_locked != 0U, ""); + __ASSERT(!arch_is_in_isr(), ""); + ++_current->base.sched_locked; + update_cache(0); + reschedule(&_sched_spinlock, key); } struct k_thread *z_swap_next_thread(void) From 741f7a5e70c2082258319b567868fa36d83547d7 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 14:20:57 -0700 Subject: [PATCH 21/27] arch/arm: Reorganize arm-m interrupt exit for performance Some nitpicky hand-optimizations, no logic changes: + Shrink the assembly entry to put more of the logic into compiler-optimizable C. + Split arm_m_must_switch() into two functions so that the first doesn't look so big to the compiler. That allows it to spill (many) fewer register on entry and speeds the (very) common early-exit case where an interrupt returns without context switch. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 45 ++++++++++++++++----------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index 6f798cb2b1d01..06cb83069fe82 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -447,8 +447,18 @@ void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *ar #endif } -bool arm_m_must_switch(uint32_t lr) +bool arm_m_do_switch(struct k_thread *last_thread, void *next); + +bool arm_m_must_switch(void) { + /* This lock is held until the end of the context switch, at + * which point it will be dropped unconditionally. Save a few + * cycles by skipping the needless bits of arch_irq_lock(). + */ + uint32_t pri = _EXC_IRQ_DEFAULT_PRIO; + + __asm__ volatile("msr basepri, %0" :: "r"(pri)); + /* Secure mode transistions can push a non-thread frame to the * stack. If not enabled, we already know by construction * that we're handling the bottom level of the interrupt stack @@ -456,26 +466,25 @@ bool arm_m_must_switch(uint32_t lr) */ if ((IS_ENABLED(CONFIG_ARM_SECURE_FIRMWARE) || IS_ENABLED(CONFIG_ARM_NONSECURE_FIRMWARE)) - && !is_thread_return(lr)) { + && !is_thread_return((uint32_t)arm_m_cs_ptrs.lr_save)) { return false; } - /* This lock is held until the end of the context switch, at - * which point it will be dropped unconditionally. Save a few - * cycles by skipping the needless bits of arch_irq_lock(). 
- */ - uint32_t pri = _EXC_IRQ_DEFAULT_PRIO; - - __asm__ volatile("msr basepri, %0" :: "r"(pri)); - - struct k_thread *last_thread = last_thread = _current; - void *last, *next = z_sched_next_handle(last_thread); + struct k_thread *last_thread = _current; + void *next = z_sched_next_handle(last_thread); if (next == NULL) { return false; } - bool fpu = fpu_state_pushed(lr); + arm_m_do_switch(last_thread, next); + return true; +} + +bool arm_m_do_switch(struct k_thread *last_thread, void *next) +{ + void *last; + bool fpu = fpu_state_pushed((uint32_t)arm_m_cs_ptrs.lr_save); __asm__ volatile("mrs %0, psp" : "=r"(last)); last = arm_m_cpu_to_switch(last_thread, last, fpu); @@ -539,19 +548,17 @@ void arm_m_legacy_exit(void) #ifdef CONFIG_MULTITHREADING __asm__(".globl arm_m_exc_exit;" "arm_m_exc_exit:;" - " ldr r2, =arm_m_cs_ptrs;" - " ldr r0, [r2, #8];" /* lr_save as argument */ " bl arm_m_must_switch;" " ldr r2, =arm_m_cs_ptrs;" - " ldr lr, [r2, #8];" /* refetch lr_save as default lr */ + " mov r3, #0;" + " ldr lr, [r2, #8];" /* lr_save */ " cbz r0, 1f;" - " ldm r2, {r0, r1};" /* fields: out, in */ " mov lr, #0xfffffffd;" /* integer-only LR */ + " ldm r2, {r0, r1};" /* fields: out, in */ " stm r0, {r4-r11};" /* out is a switch_frame */ " ldm r1!, {r7-r11};" /* in is a synth_frame */ " ldm r1, {r4-r6};" "1:\n" - " mov r1, #0;" - " msr basepri, r1;" /* release lock taken in must_switch */ + " msr basepri, r3;" /* release lock taken in must_switch */ " bx lr;"); #endif From 68a2736bd58353e2766f33448b7d60c5f67e291b Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 16:20:53 -0700 Subject: [PATCH 22/27] arch/arm: clang-format changes Late-arriving clang-format-demanded changes that are too hard to split and squash into the original patches. No behavior changes. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 15 ++- include/zephyr/arch/arm/arm-m-switch.h | 125 ++++++++++++------------- 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index 06cb83069fe82..eaefc856a41d8 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -233,7 +233,7 @@ static void iciit_fixup(struct k_thread *th, struct hw_frame_base *hw, uint32_t /* Stash original return address, replace with hook */ th->arch.iciit_pc = hw->pc; th->arch.iciit_apsr = hw->apsr; - hw->pc = (uint32_t) arm_m_iciit_stub; + hw->pc = (uint32_t)arm_m_iciit_stub; } #endif } @@ -413,7 +413,7 @@ void *arm_m_new_stack(char *base, uint32_t sz, void *entry, void *arg0, void *ar uint32_t *s_top = (uint32_t *)(stack + K_KERNEL_STACK_SIZEOF(z_interrupt_stacks[0])); arm_m_exc_lr_ptr = &s_top[-1]; - arm_m_cs_ptrs.lr_fixup = (void *) (1 | (uint32_t)arm_m_exc_exit); /* thumb bit! */ + arm_m_cs_ptrs.lr_fixup = (void *)(1 | (uint32_t)arm_m_exc_exit); /* thumb bit! */ #endif baddr = ((uint32_t)base + 7) & ~7; @@ -457,16 +457,15 @@ bool arm_m_must_switch(void) */ uint32_t pri = _EXC_IRQ_DEFAULT_PRIO; - __asm__ volatile("msr basepri, %0" :: "r"(pri)); + __asm__ volatile("msr basepri, %0" ::"r"(pri)); /* Secure mode transistions can push a non-thread frame to the * stack. If not enabled, we already know by construction * that we're handling the bottom level of the interrupt stack * and returning to thread mode. 
*/ - if ((IS_ENABLED(CONFIG_ARM_SECURE_FIRMWARE) || - IS_ENABLED(CONFIG_ARM_NONSECURE_FIRMWARE)) - && !is_thread_return((uint32_t)arm_m_cs_ptrs.lr_save)) { + if ((IS_ENABLED(CONFIG_ARM_SECURE_FIRMWARE) || IS_ENABLED(CONFIG_ARM_NONSECURE_FIRMWARE)) && + !is_thread_return((uint32_t)arm_m_cs_ptrs.lr_save)) { return false; } @@ -551,7 +550,7 @@ __asm__(".globl arm_m_exc_exit;" " bl arm_m_must_switch;" " ldr r2, =arm_m_cs_ptrs;" " mov r3, #0;" - " ldr lr, [r2, #8];" /* lr_save */ + " ldr lr, [r2, #8];" /* lr_save */ " cbz r0, 1f;" " mov lr, #0xfffffffd;" /* integer-only LR */ " ldm r2, {r0, r1};" /* fields: out, in */ @@ -559,6 +558,6 @@ __asm__(".globl arm_m_exc_exit;" " ldm r1!, {r7-r11};" /* in is a synth_frame */ " ldm r1, {r4-r6};" "1:\n" - " msr basepri, r3;" /* release lock taken in must_switch */ + " msr basepri, r3;" /* release lock taken in must_switch */ " bx lr;"); #endif diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h index 6e4c0c9aa5e6c..2780425b34e66 100644 --- a/include/zephyr/arch/arm/arm-m-switch.h +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -90,14 +90,14 @@ static inline void arm_m_exc_tail(void) * our bookeeping around EXC_RETURN, so do it early. */ void z_check_stack_sentinel(void); - void *isr_lr = (void *) *arm_m_exc_lr_ptr; + void *isr_lr = (void *)*arm_m_exc_lr_ptr; if (IS_ENABLED(CONFIG_STACK_SENTINEL)) { z_check_stack_sentinel(); } if (isr_lr != arm_m_cs_ptrs.lr_fixup) { arm_m_cs_ptrs.lr_save = isr_lr; - *arm_m_exc_lr_ptr = (uint32_t) arm_m_cs_ptrs.lr_fixup; + *arm_m_exc_lr_ptr = (uint32_t)arm_m_cs_ptrs.lr_fixup; } #endif } @@ -137,64 +137,63 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) */ register uint32_t r4 __asm__("r4") = (uint32_t)switch_to; register uint32_t r5 __asm__("r5") = (uint32_t)switched_from; - __asm__ volatile( - _R7_CLOBBER_OPT("push {r7};") - /* Construct and push a {r12, lr, pc} group at the top - * of the frame, where PC points to the final restore location - * at the end of this sequence. - */ - "mov r6, r12;" - "mov r7, lr;" - "ldr r8, =3f;" /* address of restore PC */ - "add r8, r8, #1;" /* set thumb bit */ - "push {r6-r8};" - "sub sp, sp, #24;" /* skip over space for r6-r11 */ - "push {r0-r5};" - "mov r2, #0x01000000;" /* APSR (only care about thumb bit) */ - "mov r0, #0;" /* Leave r0 zero for code blow */ + __asm__ volatile(_R7_CLOBBER_OPT("push {r7};") + /* Construct and push a {r12, lr, pc} group at the top + * of the frame, where PC points to the final restore location + * at the end of this sequence. 
+ */ + "mov r6, r12;" + "mov r7, lr;" + "ldr r8, =3f;" /* address of restore PC */ + "add r8, r8, #1;" /* set thumb bit */ + "push {r6-r8};" + "sub sp, sp, #24;" /* skip over space for r6-r11 */ + "push {r0-r5};" + "mov r2, #0x01000000;" /* APSR (only care about thumb bit) */ + "mov r0, #0;" /* Leave r0 zero for code blow */ #ifdef CONFIG_BUILTIN_STACK_GUARD - "mrs r1, psplim;" - "push {r1-r2};" - "msr psplim, r0;" /* zero it so we can move the stack */ + "mrs r1, psplim;" + "push {r1-r2};" + "msr psplim, r0;" /* zero it so we can move the stack */ #else - "push {r2};" + "push {r2};" #endif #ifdef CONFIG_FPU - /* Push FPU state (if active) to our outgoing stack */ - " mrs r8, control;" /* read CONTROL.FPCA */ - " and r7, r8, #4;" /* r7 == have_fpu */ - " cbz r7, 1f;" - " bic r8, r8, #4;" /* clear CONTROL.FPCA */ - " msr control, r8;" - " vmrs r6, fpscr;" - " push {r6};" - " vpush {s0-s31};" - "1: push {r7};" /* have_fpu word */ - - /* Pop FPU state (if present) from incoming frame in r4 */ - " ldm r4!, {r7};" /* have_fpu word */ - " cbz r7, 2f;" - " vldm r4!, {s0-s31};" /* (note: sets FPCA bit for us) */ - " ldm r4!, {r6};" - " vmsr fpscr, r6;" - "2:;" + /* Push FPU state (if active) to our outgoing stack */ + " mrs r8, control;" /* read CONTROL.FPCA */ + " and r7, r8, #4;" /* r7 == have_fpu */ + " cbz r7, 1f;" + " bic r8, r8, #4;" /* clear CONTROL.FPCA */ + " msr control, r8;" + " vmrs r6, fpscr;" + " push {r6};" + " vpush {s0-s31};" + "1: push {r7};" /* have_fpu word */ + + /* Pop FPU state (if present) from incoming frame in r4 */ + " ldm r4!, {r7};" /* have_fpu word */ + " cbz r7, 2f;" + " vldm r4!, {s0-s31};" /* (note: sets FPCA bit for us) */ + " ldm r4!, {r6};" + " vmsr fpscr, r6;" + "2:;" #endif #if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) - " ldr r8, =arm_m_switch_control;" - " ldr r8, [r8];" - " msr control, r8;" + " ldr r8, =arm_m_switch_control;" + " ldr r8, [r8];" + " msr control, r8;" #endif - /* Save the outgoing switch handle (which is SP), swap stacks, - * and enable interrupts. The restore process is - * interruptible code (running in the incoming thread) once - * the stack is valid. - */ - "str sp, [r5];" - "mov sp, r4;" - "msr basepri, r0;" + /* Save the outgoing switch handle (which is SP), swap stacks, + * and enable interrupts. The restore process is + * interruptible code (running in the incoming thread) once + * the stack is valid. + */ + "str sp, [r5];" + "mov sp, r4;" + "msr basepri, r0;" /* Restore is super simple: pop the flags (and stack limit if * enabled) then slurp in the whole GPR set in two @@ -202,27 +201,27 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) * both LR and PC in a single instruction) */ #ifdef CONFIG_BUILTIN_STACK_GUARD - "pop {r1-r2};" - "msr psplim, r1;" + "pop {r1-r2};" + "msr psplim, r1;" #else - "pop {r2};" + "pop {r2};" #endif #ifdef _ARM_M_SWITCH_HAVE_DSP - "msr apsr_nzcvqg, r2;" /* bonkers syntax */ + "msr apsr_nzcvqg, r2;" /* bonkers syntax */ #else - "msr apsr_nzcvq, r2;" /* not even source-compatible! */ + "msr apsr_nzcvq, r2;" /* not even source-compatible! 
*/ #endif - "pop {r0-r12, lr};" - "pop {pc};" + "pop {r0-r12, lr};" + "pop {pc};" - "3:" /* Label for restore address */ - _R7_CLOBBER_OPT("pop {r7};") - ::"r"(r4), "r"(r5) - : "r6", "r8", "r9", "r10", + "3:" /* Label for restore address */ + _R7_CLOBBER_OPT("pop {r7};")::"r"(r4), + "r"(r5) + : "r6", "r8", "r9", "r10", #ifndef CONFIG_ARM_GCC_FP_WORKAROUND - "r7", + "r7", #endif - "r11"); + "r11"); } #ifdef CONFIG_USE_SWITCH From 89c64bf7bc6a287f231c47f83e72c99494aabc60 Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 19:51:57 -0700 Subject: [PATCH 23/27] soc/nxp: Update SOC-specific vector tables for PendSV removal The z_arm_pendsv vector doesn't exist on USE_SWITCH=y builds Signed-off-by: Andy Ross --- soc/nxp/imxrt/imxrt5xx/cm33/soc.c | 8 +++++++- soc/nxp/imxrt/imxrt6xx/cm33/soc.c | 8 +++++++- soc/nxp/rw/soc.c | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/soc/nxp/imxrt/imxrt5xx/cm33/soc.c b/soc/nxp/imxrt/imxrt5xx/cm33/soc.c index b826080323f12..38fb023600a21 100644 --- a/soc/nxp/imxrt/imxrt5xx/cm33/soc.c +++ b/soc/nxp/imxrt/imxrt5xx/cm33/soc.c @@ -99,6 +99,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -118,7 +124,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. */ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else diff --git a/soc/nxp/imxrt/imxrt6xx/cm33/soc.c b/soc/nxp/imxrt/imxrt6xx/cm33/soc.c index 68f5b9faf16db..4c65d79e44fcf 100644 --- a/soc/nxp/imxrt/imxrt6xx/cm33/soc.c +++ b/soc/nxp/imxrt/imxrt6xx/cm33/soc.c @@ -93,6 +93,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -112,7 +118,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. 
*/ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else diff --git a/soc/nxp/rw/soc.c b/soc/nxp/rw/soc.c index 166c6008310e2..f5d024bd52d99 100644 --- a/soc/nxp/rw/soc.c +++ b/soc/nxp/rw/soc.c @@ -47,6 +47,12 @@ extern void z_arm_pendsv(void); extern void sys_clock_isr(void); extern void z_arm_exc_spurious(void); +#ifdef CONFIG_USE_SWITCH +#define PENDSV_VEC z_arm_exc_spurious +#else +#define PENDSV_VEC z_arm_pendsv +#endif + __imx_boot_ivt_section void (*const image_vector_table[])(void) = { (void (*)())(z_main_stack + CONFIG_MAIN_STACK_SIZE), /* 0x00 */ z_arm_reset, /* 0x04 */ @@ -66,7 +72,7 @@ __imx_boot_ivt_section void (*const image_vector_table[])(void) = { z_arm_svc, /* 0x2C */ z_arm_debug_monitor, /* 0x30 */ (void (*)())image_vector_table, /* 0x34, imageLoadAddress. */ - z_arm_pendsv, /* 0x38 */ + PENDSV_VEC, /* 0x38 */ #if defined(CONFIG_SYS_CLOCK_EXISTS) && defined(CONFIG_CORTEX_M_SYSTICK_INSTALL_ISR) sys_clock_isr, /* 0x3C */ #else From c01ca120b16b1ee18adc30c7268de82a0d84ff2e Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Wed, 6 Aug 2025 18:27:18 -0700 Subject: [PATCH 24/27] arch/arm: Avoid top-level asm() blocks Some toolchains don't support an __asm__(...) block at the top level of a file and require that they live within function scope. That's not a hardship as these two blocks were defining callable functions anyway. Exploit the "naked" attribute to avoid wasted bytes in unused entry/exit code. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 38 ++++++++++++++------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index eaefc856a41d8..3d2f7fbcff05e 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -221,9 +221,10 @@ static bool fpu_state_pushed(uint32_t lr) * in the thread struct). The overhead for the normal case is just a * few cycles for the test. */ -__asm__(".globl arm_m_iciit_stub;" - "arm_m_iciit_stub:;" - " udf 0;"); +__attribute__((naked)) void arm_m_iciit_stub(void) +{ + __asm__("udf 0;"); +} /* Called out of interrupt entry to test for an interrupted instruction */ static void iciit_fixup(struct k_thread *th, struct hw_frame_base *hw, uint32_t xpsr) @@ -545,19 +546,20 @@ void arm_m_legacy_exit(void) * handled in software already. 
*/ #ifdef CONFIG_MULTITHREADING -__asm__(".globl arm_m_exc_exit;" - "arm_m_exc_exit:;" - " bl arm_m_must_switch;" - " ldr r2, =arm_m_cs_ptrs;" - " mov r3, #0;" - " ldr lr, [r2, #8];" /* lr_save */ - " cbz r0, 1f;" - " mov lr, #0xfffffffd;" /* integer-only LR */ - " ldm r2, {r0, r1};" /* fields: out, in */ - " stm r0, {r4-r11};" /* out is a switch_frame */ - " ldm r1!, {r7-r11};" /* in is a synth_frame */ - " ldm r1, {r4-r6};" - "1:\n" - " msr basepri, r3;" /* release lock taken in must_switch */ - " bx lr;"); +__attribute__((naked)) void arm_m_exc_exit(void) +{ + __asm__(" bl arm_m_must_switch;" + " ldr r2, =arm_m_cs_ptrs;" + " mov r3, #0;" + " ldr lr, [r2, #8];" /* lr_save */ + " cbz r0, 1f;" + " mov lr, #0xfffffffd;" /* integer-only LR */ + " ldm r2, {r0, r1};" /* fields: out, in */ + " stm r0, {r4-r11};" /* out is a switch_frame */ + " ldm r1!, {r7-r11};" /* in is a synth_frame */ + " ldm r1, {r4-r6};" + "1:\n" + " msr basepri, r3;" /* release lock taken in must_switch */ + " bx lr;"); +} #endif From 14c00dbf013e54687e9e2dc58e89c8746f42fcda Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sun, 10 Aug 2025 10:02:42 -0700 Subject: [PATCH 25/27] tests/arm_interrupt: Disable persistent lock test when USE_SWITCH=y I'm at a loss here. The feature this test case wants to see (on ARMv7M, not ARMv6M) is the ability to take a irq_lock() inside a system call and then see that future system calls from the same thread continue to hold the lock. That's not documented AFAICT. It's also just a terrible idea because either: 1. The obvious denial of service implications if user code is allowed to run in an unpreemptible mode, or: 2. The broken locking promise if this is implemented to release the lock and reacquire it in an attempt to avoid #1. (FWIW: my read of the code is that #1 is the current implementation. But hilariously the test isn't able to tell the difference!) And in any case it's not how any of our other platforms work (or can work, in some cases), making this a non-portable system call API/feature at best. Leave it in place for now out of conservatism, but disable with the new arch_switch() code, whose behavior matches that of other Zephyr userspaces. Signed-off-by: Andy Ross --- tests/arch/arm/arm_interrupt/src/arm_interrupt.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/arch/arm/arm_interrupt/src/arm_interrupt.c b/tests/arch/arm/arm_interrupt/src/arm_interrupt.c index 05535283be332..f47eac5646d93 100644 --- a/tests/arch/arm/arm_interrupt/src/arm_interrupt.c +++ b/tests/arch/arm/arm_interrupt/src/arm_interrupt.c @@ -371,8 +371,19 @@ void z_impl_test_arm_user_interrupt_syscall(void) #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) /* Confirm IRQs are not locked */ zassert_false(__get_PRIMASK(), "PRIMASK is set\n"); -#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) - +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) && defined(CONFIG_USE_SWITCH) + /* Confirm IRQs are not locked */ + zassert_false(__get_BASEPRI(), "BASEPRI is set\n"); +#elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) && !defined(CONFIG_USE_SWITCH) + /* The original ARMv7M userspace implementation had the + * somewhat odd (and dangerous!) behavior that an interrupt + * mask done in a system call would persist (!!) on return + * into userspace, and be seen again once we were back in + * kernel mode, and this is a test for it. It's legacy + * though, not something we ever documented, and not the way + * userspace works (or even can work!) 
on other platforms, nor + * with the newer arch_switch() code. Consider removing. + */ static bool first_call = 1; if (first_call == 1) { From 087243ee81afb87c10e3b45c30cc3f8c3a8ee21b Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Sun, 10 Aug 2025 12:13:21 -0700 Subject: [PATCH 26/27] arch/arm: Cleanup synchronization on userspace entry The exit from the SVC exception used for syscalls back into the calling thread is done without locking. This means that the intermediate states can be interrupted while the kernel-mode code is still managing thread state like the mode bit, leading to mismatches. This seems mostly robust when used with PendSV (though I'm a little dubious), but the new arch_switch() code needs to be able to suspend such an interrupted thread and restore it without going through a full interrupt entry/exit again, so it needs locking for sure. Take the lock unconditionally before exiting the call, and release it in the thread once the magic is finished, just before calling the handler. Then take it again before swapping stacks and dropping privilege. Even then there is a one-cycle race where the interrupted thread has dropped the lock but still has privilege (the nPRIV bit is clear in CONTROL). This thread will be resumed later WITHOUT privilege, which means that trying to set CONTROL will fail. So there's detection of this 1-instruction race that will skip over it. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 38 +++++++++++++----- arch/arm/core/cortex_m/svc.S | 8 ++++ arch/arm/core/userspace.S | 53 +++++++++++++------------- include/zephyr/arch/arm/arm-m-switch.h | 10 ++--- 4 files changed, 68 insertions(+), 41 deletions(-) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index 3d2f7fbcff05e..a60ef9d69f4ff 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -487,6 +487,35 @@ bool arm_m_do_switch(struct k_thread *last_thread, void *next) bool fpu = fpu_state_pushed((uint32_t)arm_m_cs_ptrs.lr_save); __asm__ volatile("mrs %0, psp" : "=r"(last)); + +#ifdef CONFIG_USERSPACE + /* Update CONTROL register's nPRIV bit to reflect user/syscall + * thread state of the incoming thread. + */ + extern char z_syscall_exit_race1, z_syscall_exit_race2; + struct hw_frame_base *f = last; + uint32_t control; + + /* Note that the privilege state is stored in the CPU *AND* in + * the "mode" field of the thread struct. This creates an + * unavoidable race because the syscall exit code can't + * atomically release the lock and set CONTROL. We detect the + * case where a thread is interrupted at exactly that moment + * and manually skip over the MSR instruction (which would + * otherwise fault on resume because "mode" says that the + * thread is unprivileged!). A future rework should just + * pickle the CPU state found in the exception to the frame + * and not mess with thread state from within the syscall. 
+ */ + if (pc_match(f->pc, &z_syscall_exit_race1)) { + f->pc = (uint32_t) &z_syscall_exit_race2; + } + + __asm__ volatile("mrs %0, control" : "=r"(control)); + control = (control & ~1) | (_current->arch.mode & 1); + __asm__ volatile("msr control, %0" ::"r"(control)); +#endif + last = arm_m_cpu_to_switch(last_thread, last, fpu); next = arm_m_switch_to_cpu(next); __asm__ volatile("msr psp, %0" ::"r"(next)); @@ -508,15 +537,6 @@ bool arm_m_do_switch(struct k_thread *last_thread, void *next) last_thread->switch_handle = last; #endif -#ifdef CONFIG_USERSPACE - uint32_t control; - - __asm__ volatile("mrs %0, control" : "=r"(control)); - last_thread->arch.mode &= (~1) | (control & 1); - control = (control & ~1) | (_current->arch.mode & 1); - __asm__ volatile("msr control, %0" ::"r"(control)); -#endif - #if defined(CONFIG_USERSPACE) || defined(CONFIG_MPU_STACK_GUARD) z_arm_configure_dynamic_mpu_regions(_current); #endif diff --git a/arch/arm/core/cortex_m/svc.S b/arch/arm/core/cortex_m/svc.S index 34b4e1afd7ddc..3676e9dd580d2 100644 --- a/arch/arm/core/cortex_m/svc.S +++ b/arch/arm/core/cortex_m/svc.S @@ -261,6 +261,14 @@ SECTION_FUNC(TEXT, z_arm_svc) msr PSPLIM, r1 #endif /* CONFIG_BUILTIN_STACK_GUARD */ + /* Take lock before returning from svc */ +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) + mov r0, #_EXC_IRQ_DEFAULT_PRIO + msr BASEPRI, r0 +#else + cpsid i +#endif + /* return from SVC to the modified LR - z_arm_do_syscall */ bx lr #endif /* CONFIG_USERSPACE */ diff --git a/arch/arm/core/userspace.S b/arch/arm/core/userspace.S index 3594940078da3..4576907948703 100644 --- a/arch/arm/core/userspace.S +++ b/arch/arm/core/userspace.S @@ -26,11 +26,21 @@ GTEXT(arch_user_string_nlen) GTEXT(z_arm_user_string_nlen_fault_start) GTEXT(z_arm_user_string_nlen_fault_end) GTEXT(z_arm_user_string_nlen_fixup) +GTEXT(z_syscall_exit_race1) +GTEXT(z_syscall_exit_race2) GDATA(_kernel) /* Imports */ GDATA(_k_syscall_table) +#if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) +# define LOCK(reg) mov reg, #_EXC_IRQ_DEFAULT_PRIO ; msr BASEPRI, reg +# define UNLOCK(reg) mov reg, #0 ; msr BASEPRI, reg +#else +# define LOCK(unused) cpsid i +# define UNLOCK(unused) cpsie i +#endif + /** * * User space entry function @@ -47,7 +57,9 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) /* move user_entry to lr */ mov lr, r0 - /* prepare to set stack to privileged stack */ + LOCK(r0) + + /* prepare to set stack to privilege */ ldr r0, =_kernel ldr r0, [r0, #_kernel_offset_to_current] #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) @@ -213,15 +225,8 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) * large enough to accommodate a maximum exception stack frame. */ - /* Temporarily store current IRQ locking status in ip */ - mrs ip, BASEPRI push {r0, ip} - /* Lock PendSV while reprogramming PSP and PSPLIM */ - mov r0, #_EXC_PENDSV_PRIO_MASK - msr BASEPRI_MAX, r0 - isb - /* Set PSPLIM to guard the thread's user stack. 
*/ ldr r0, =_kernel ldr r0, [r0, #_kernel_offset_to_current] @@ -235,11 +240,7 @@ SECTION_FUNC(TEXT,z_arm_userspace_enter) msr PSP, r0 #endif -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Restore interrupt lock status */ - msr BASEPRI, ip - isb -#endif + UNLOCK(r0) /* restore r0 */ mov r0, lr @@ -394,6 +395,8 @@ SECTION_FUNC(TEXT, z_arm_do_syscall) msr PSPLIM, ip #endif + UNLOCK(ip) + /* * r0-r5 contain arguments * r6 contains call_id @@ -478,6 +481,7 @@ dispatch_syscall: ldr lr, [sp,#16] #endif + LOCK(ip) #if defined(CONFIG_BUILTIN_STACK_GUARD) /* @@ -497,14 +501,6 @@ dispatch_syscall: * large enough to accommodate a maximum exception stack frame. */ - /* Temporarily store current IRQ locking status in r2 */ - mrs r2, BASEPRI - - /* Lock PendSV while reprogramming PSP and PSPLIM */ - mov r3, #_EXC_PENDSV_PRIO_MASK - msr BASEPRI_MAX, r3 - isb - /* Set PSPLIM to guard the thread's user stack. */ ldr r3, =_kernel ldr r3, [r3, #_kernel_offset_to_current] @@ -527,12 +523,6 @@ dispatch_syscall: msr PSP, ip #endif -#if defined(CONFIG_BUILTIN_STACK_GUARD) - /* Restore interrupt lock status */ - msr BASEPRI, r2 - isb -#endif - push {r0, r1} #if defined(CONFIG_ARMV6_M_ARMV8_M_BASELINE) push {r2, r3} @@ -548,6 +538,7 @@ dispatch_syscall: /* drop privileges by setting bit 0 in CONTROL */ mrs r2, CONTROL orrs r2, r2, r3 + msr CONTROL, r2 pop {r2, r3} #elif defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) \ @@ -559,11 +550,19 @@ dispatch_syscall: /* Store (unprivileged) mode in thread's mode state variable */ str r1, [r0, #_thread_offset_to_mode] dsb + #if defined(CONFIG_ARMV7_M_ARMV8_M_MAINLINE) /* drop privileges by setting bit 0 in CONTROL */ mrs ip, CONTROL orrs ip, ip, #1 + + /* drop lock immediately before setting CONTROL, see comments in arm-m-switch.c */ + UNLOCK(r0) +z_syscall_exit_race1: msr CONTROL, ip +z_syscall_exit_race2: +#else + UNLOCK(r0) #endif #endif diff --git a/include/zephyr/arch/arm/arm-m-switch.h b/include/zephyr/arch/arm/arm-m-switch.h index 2780425b34e66..2bd9acf6b9aca 100644 --- a/include/zephyr/arch/arm/arm-m-switch.h +++ b/include/zephyr/arch/arm/arm-m-switch.h @@ -113,17 +113,14 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) #endif #if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) - /* Need to manage CONTROL.nPRIV bit. We know the outgoing + /* Set things up to write the CONTROL.nPRIV bit. We know the outgoing * thread is in privileged mode (because you can't reach a * context switch unless you're in the kernel!). */ extern uint32_t arm_m_switch_control; uint32_t control; - struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread, switch_handle); - old->arch.mode &= ~1; __asm__ volatile("mrs %0, control" : "=r"(control)); - __ASSERT_NO_MSG((control & 1) == 0); arm_m_switch_control = (control & ~1) | (_current->arch.mode & 1); #endif @@ -183,7 +180,6 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) #if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) " ldr r8, =arm_m_switch_control;" " ldr r8, [r8];" - " msr control, r8;" #endif /* Save the outgoing switch handle (which is SP), swap stacks, @@ -195,6 +191,10 @@ static ALWAYS_INLINE void arm_m_switch(void *switch_to, void **switched_from) "mov sp, r4;" "msr basepri, r0;" +#if defined(CONFIG_USERSPACE) && defined(CONFIG_USE_SWITCH) + " msr control, r8;" /* Now we can drop privilege */ +#endif + /* Restore is super simple: pop the flags (and stack limit if * enabled) then slurp in the whole GPR set in two * instructions. 
(The instruction encoding disallows popping From f2f356a64c2df77a98ac24c189c2a0834a7d915a Mon Sep 17 00:00:00 2001 From: Andy Ross Date: Mon, 11 Aug 2025 10:37:45 -0700 Subject: [PATCH 27/27] arch/arm: Work around FVP stkalign glitch The ARM Ltd. FVP emulator (at least the variants run in Zephyr CI) appears to have a bug with the stack alignment bit in xPSR. It's common (it fails in the first 4-6 timer interrupts in tests.syscalls.timeslicing) that we'll take an interrupt from a seemingly aligned (!) stack with the bit set. If we then switch and resume the thread from a different context later, popping the stack goes wrong (more so than just a misalignment of four bytes: I usually see it too low by 20 bytes) in a way that it doesn't if we return synchronously. Presumably legacy PendSV didn't see this because it used the unmodified exception frame. Work around this by simply assuming all interrupted stacks were aligned and clearing the bit. That is NOT correct in the general case, but in practice it's enough to get tests to pass. Signed-off-by: Andy Ross --- arch/arm/core/cortex_m/arm-m-switch.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/arm/core/cortex_m/arm-m-switch.c b/arch/arm/core/cortex_m/arm-m-switch.c index a60ef9d69f4ff..7040907d48f5c 100644 --- a/arch/arm/core/cortex_m/arm-m-switch.c +++ b/arch/arm/core/cortex_m/arm-m-switch.c @@ -544,6 +544,31 @@ bool arm_m_do_switch(struct k_thread *last_thread, void *next) #ifdef CONFIG_THREAD_LOCAL_STORAGE z_arm_tls_ptr = _current->tls; #endif + +#if defined(CONFIG_BOARD_MPS3_CORSTONE310_FVP) || \ + defined(CONFIG_BOARD_MPS4_CORSTONE315_FVP) || \ + defined(CONFIG_BOARD_MPS4_CORSTONE320_FVP) + /* The ARM Ltd. FVP emulator (at least the ones above that run + * in Zephyr CI) appears to have a bug with the stack + * alignment bit in xPSR. It's common (it fails in the first + * 4-6 timer interrupts in tests.syscalls.timeslicing) that + * we'll take an interrupt from a seemingly aligned (!) stack + * with the bit set. If we then switch and resume the thread + * from a different context later, popping the stack goes + * wrong (more so than just a misalignment of four bytes: I + * usually see it too low by 20 bytes) in a way that it + * doesn't if we return synchronously. Presumably legacy + * PendSV didn't see this because it used the unmodified + * exception frame. + * + * Work around this here, pending validation from ARM, by + * simply assuming all interrupted stacks were aligned and + * clearing the bit. That is NOT correct in the general case, + * but in practice it's enough to get tests to pass. + */ + ((struct hw_frame_base *)next)->apsr &= ~BIT(9); +#endif + return true; }
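
For anyone reviewing the workaround above: bit 9 of the stacked xPSR is the
flag the CPU sets when it pushed a 4-byte pad to keep the exception frame
8-byte aligned, and a faithful unwind adds that pad back when the frame is
popped. A minimal sketch of that adjustment follows (the helper is
hypothetical, not part of this series, and is shown only to make explicit
what clearing the bit opts out of):

    #include <zephyr/sys/util.h>

    /* Sketch only: given the SP value immediately after popping a hardware
     * exception frame, account for the alignment pad word the CPU may have
     * inserted on entry. The patch above deliberately skips this on the FVP
     * boards by clearing the bit before resuming the thread.
     */
    static inline uint32_t unwind_sp_after_frame(uint32_t sp, uint32_t stacked_xpsr)
    {
            if ((stacked_xpsr & BIT(9)) != 0U) {
                    sp += 4U; /* equivalent to the OR of SP bit 2 done by
                               * hardware on exception return
                               */
            }
            return sp;
    }
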