From 6d8c1073448ede2e646d7809fe2c6c8a06e56687 Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:40 -0700
Subject: [PATCH 1/7] asm-generic: barrier: Add smp_cond_load_relaxed_timeout()

Add smp_cond_load_relaxed_timeout(), which extends
smp_cond_load_relaxed() to allow waiting for a duration.

The waiting loop uses cpu_poll_relax() to wait on the condition
variable with a periodic evaluation of a time-check.

cpu_poll_relax(), unless overridden by the arch code, amounts to a
cpu_relax().

The number of times we spin is defined by SMP_TIMEOUT_POLL_COUNT
(chosen to be 200 by default) which, assuming each cpu_poll_relax()
iteration takes around 20-30 cycles (measured on a variety of x86
platforms), amounts to a total of ~4000-6000 cycles.

Cc: Arnd Bergmann
Cc: Will Deacon
Cc: Catalin Marinas
Cc: Peter Zijlstra
Cc: linux-arch@vger.kernel.org
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Tested-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 include/asm-generic/barrier.h | 41 +++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index d4f581c1e21da..0063b46ec0657 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -273,6 +273,47 @@ do {									\
 })
 #endif
 
+#ifndef SMP_TIMEOUT_POLL_COUNT
+#define SMP_TIMEOUT_POLL_COUNT	200
+#endif
+
+#ifndef cpu_poll_relax
+#define cpu_poll_relax(ptr, val)	cpu_relax()
+#endif
+
+/**
+ * smp_cond_load_relaxed_timeout() - (Spin) wait for cond with no ordering
+ * guarantees until a timeout expires.
+ * @ptr: pointer to the variable to wait on
+ * @cond: boolean expression to wait for
+ * @time_check_expr: expression to decide when to bail out
+ *
+ * Equivalent to using READ_ONCE() on the condition variable.
+ */
+#ifndef smp_cond_load_relaxed_timeout
+#define smp_cond_load_relaxed_timeout(ptr, cond_expr, time_check_expr)	\
+({									\
+	typeof(ptr) __PTR = (ptr);					\
+	__unqual_scalar_typeof(*ptr) VAL;				\
+	u32 __n = 0, __spin = SMP_TIMEOUT_POLL_COUNT;			\
+									\
+	for (;;) {							\
+		VAL = READ_ONCE(*__PTR);				\
+		if (cond_expr)						\
+			break;						\
+		cpu_poll_relax(__PTR, VAL);				\
+		if (++__n < __spin)					\
+			continue;					\
+		if (time_check_expr) {					\
+			VAL = READ_ONCE(*__PTR);			\
+			break;						\
+		}							\
+		__n = 0;						\
+	}								\
+	(typeof(*ptr))VAL;						\
+})
+#endif
+
 /*
  * pmem_wmb() ensures that all stores for which the modification
  * are written to persistent storage by preceding instructions have

From 6718946a63a3458d544d2f1da9879f516fdec61d Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:41 -0700
Subject: [PATCH 2/7] arm64: barrier: support smp_cond_load_relaxed_timeout()

Support waiting in smp_cond_load_relaxed_timeout() via
__cmpwait_relaxed(). Limit this to when the event-stream is enabled,
to ensure that we wake from WFE periodically and don't block forever
if there are no stores to the cacheline.

In the unlikely event that the event-stream is unavailable, fall back
to spin-waiting.

Also set SMP_TIMEOUT_POLL_COUNT to 1 so we do the time-check on each
iteration in smp_cond_load_relaxed_timeout().
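As an illustration (not part of this patch), a hypothetical caller
could wait on a flag with a deadline roughly as follows; poll_limit_ns
and addr are made up for the example:

	u64 end = local_clock_noinstr() + poll_limit_ns;

	/*
	 * Re-read *addr until it becomes non-zero or the deadline check
	 * fires. On arm64 with the event stream enabled, each iteration
	 * waits in WFE via __cmpwait_relaxed(); otherwise it spins with
	 * cpu_relax().
	 */
	val = smp_cond_load_relaxed_timeout(addr, VAL != 0,
					    (local_clock_noinstr() >= end));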
Cc: linux-arm-kernel@lists.infradead.org
Cc: Catalin Marinas
Suggested-by: Will Deacon
Signed-off-by: Ankur Arora
---
 arch/arm64/include/asm/barrier.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index f5801b0ba9e9e..92c16dfb8ca6c 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -219,6 +219,19 @@ do {									\
 	(typeof(*ptr))VAL;						\
 })
 
+#define SMP_TIMEOUT_POLL_COUNT	1
+
+/* Re-declared here to avoid include dependency. */
+extern bool arch_timer_evtstrm_available(void);
+
+#define cpu_poll_relax(ptr, val)					\
+do {									\
+	if (arch_timer_evtstrm_available())				\
+		__cmpwait_relaxed(ptr, val);				\
+	else								\
+		cpu_relax();						\
+} while (0)
+
 #include <asm-generic/barrier.h>
 
 #endif	/* __ASSEMBLY__ */

From 83e87f8112a7361c26fe481b4ea2ac29afde273b Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:42 -0700
Subject: [PATCH 3/7] arm64: rqspinlock: Remove private copy of smp_cond_load_acquire_timewait

In preparation for defining smp_cond_load_acquire_timeout(), remove
the private copy. Lacking this, the rqspinlock code falls back to
using smp_cond_load_acquire().

Cc: Kumar Kartikeya Dwivedi
Cc: Alexei Starovoitov
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 arch/arm64/include/asm/rqspinlock.h | 85 -----------------------------
 1 file changed, 85 deletions(-)

diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
index 9ea0a74e58927..a385603436e98 100644
--- a/arch/arm64/include/asm/rqspinlock.h
+++ b/arch/arm64/include/asm/rqspinlock.h
@@ -3,91 +3,6 @@
 #define _ASM_RQSPINLOCK_H
 
 #include <asm/barrier.h>
-
-/*
- * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom
- * version based on [0]. In rqspinlock code, our conditional expression involves
- * checking the value _and_ additionally a timeout. However, on arm64, the
- * WFE-based implementation may never spin again if no stores occur to the
- * locked byte in the lock word. As such, we may be stuck forever if
- * event-stream based unblocking is not available on the platform for WFE spin
- * loops (arch_timer_evtstrm_available).
- *
- * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
- * copy-paste.
- *
- * While we rely on the implementation to amortize the cost of sampling
- * cond_expr for us, it will not happen when event stream support is
- * unavailable, time_expr check is amortized. This is not the common case, and
- * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns
- * comparison, hence just let it be. In case of event-stream, the loop is woken
- * up at microsecond granularity.
- *
- * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
- */
-
-#ifndef smp_cond_load_acquire_timewait
-
-#define smp_cond_time_check_count	200
-
-#define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns,	\
-					 time_limit_ns) ({		\
-	typeof(ptr) __PTR = (ptr);					\
-	__unqual_scalar_typeof(*ptr) VAL;				\
-	unsigned int __count = 0;					\
-	for (;;) {							\
-		VAL = READ_ONCE(*__PTR);				\
-		if (cond_expr)						\
-			break;						\
-		cpu_relax();						\
-		if (__count++ < smp_cond_time_check_count)		\
-			continue;					\
-		if ((time_expr_ns) >= (time_limit_ns))			\
-			break;						\
-		__count = 0;						\
-	}								\
-	(typeof(*ptr))VAL;						\
-})
-
-#define __smp_cond_load_acquire_timewait(ptr, cond_expr,		\
-					 time_expr_ns, time_limit_ns)	\
-({									\
-	typeof(ptr) __PTR = (ptr);					\
-	__unqual_scalar_typeof(*ptr) VAL;				\
-	for (;;) {							\
-		VAL = smp_load_acquire(__PTR);				\
-		if (cond_expr)						\
-			break;						\
-		__cmpwait_relaxed(__PTR, VAL);				\
-		if ((time_expr_ns) >= (time_limit_ns))			\
-			break;						\
-	}								\
-	(typeof(*ptr))VAL;						\
-})
-
-#define smp_cond_load_acquire_timewait(ptr, cond_expr,			\
-				       time_expr_ns, time_limit_ns)	\
-({									\
-	__unqual_scalar_typeof(*ptr) _val;				\
-	int __wfe = arch_timer_evtstrm_available();			\
-									\
-	if (likely(__wfe)) {						\
-		_val = __smp_cond_load_acquire_timewait(ptr, cond_expr,	\
-							time_expr_ns,	\
-							time_limit_ns);	\
-	} else {							\
-		_val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr,	\
-							time_expr_ns,	\
-							time_limit_ns);	\
-		smp_acquire__after_ctrl_dep();				\
-	}								\
-	(typeof(*ptr))_val;						\
-})
-
-#endif
-
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1)
-
 #include <asm-generic/rqspinlock.h>
 
 #endif /* _ASM_RQSPINLOCK_H */

From 7baf042d339b11f240d092adcb1c555d8c63ce83 Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:43 -0700
Subject: [PATCH 4/7] asm-generic: barrier: Add smp_cond_load_acquire_timeout()

Add the acquire variant of smp_cond_load_relaxed_timeout(). This
reuses the relaxed variant, with an additional LOAD->LOAD ordering.

Cc: Arnd Bergmann
Cc: Will Deacon
Cc: Catalin Marinas
Cc: Peter Zijlstra
Cc: linux-arch@vger.kernel.org
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Tested-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 include/asm-generic/barrier.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 0063b46ec0657..9a218f558c5c9 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -314,6 +314,28 @@ do {									\
 })
 #endif
 
+/**
+ * smp_cond_load_acquire_timeout() - (Spin) wait for cond with ACQUIRE ordering
+ * until a timeout expires.
+ *
+ * Arguments: same as smp_cond_load_relaxed_timeout().
+ *
+ * Equivalent to using smp_cond_load_acquire() on the condition variable with
+ * a timeout.
+ */
+#ifndef smp_cond_load_acquire_timeout
+#define smp_cond_load_acquire_timeout(ptr, cond_expr, time_check_expr)	\
+({									\
+	__unqual_scalar_typeof(*ptr) _val;				\
+	_val = smp_cond_load_relaxed_timeout(ptr, cond_expr,		\
+					     time_check_expr);		\
+									\
+	/* Depends on the control dependency of the wait above. */	\
+	smp_acquire__after_ctrl_dep();					\
+	(typeof(*ptr))_val;						\
+})
+#endif
+
 /*
  * pmem_wmb() ensures that all stores for which the modification
  * are written to persistent storage by preceding instructions have

From 1876a0ffc800bb3d40efc739b7c9add7b87eb0cd Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:44 -0700
Subject: [PATCH 5/7] atomic: Add atomic_cond_read_*_timeout()

Add atomic_cond_read_*_timeout() and atomic64_cond_read_*_timeout()
to provide atomic load wrappers around the cond-load timeout
interfaces.

Cc: Will Deacon
Cc: Peter Zijlstra
Cc: Boqun Feng
Signed-off-by: Ankur Arora
---
 include/linux/atomic.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 8dd57c3a99e9b..b3f77a89e9e1b 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -31,6 +31,14 @@
 #define atomic64_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c))
 #define atomic64_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c))
 
+#define atomic_cond_read_acquire_timeout(v, c, t)			\
+	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
+#define atomic_cond_read_relaxed_timeout(v, c, t)			\
+	smp_cond_load_relaxed_timeout(&(v)->counter, (c), (t))
+
+#define atomic64_cond_read_acquire_timeout(v, c, t) smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
+#define atomic64_cond_read_relaxed_timeout(v, c, t) smp_cond_load_relaxed_timeout(&(v)->counter, (c), (t))
+
 /*
  * The idea here is to build acquire/release variants by adding explicit
  * barriers on top of the relaxed variant. In the case where the relaxed

From 27eaa307db699b330b8e54e1d918fa5704f52d5f Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 16 Oct 2025 09:56:45 -0700
Subject: [PATCH 6/7] rqspinlock: use smp_cond_load_acquire_timeout()

Switch out the conditional load interfaces used by rqspinlock
to atomic_cond_read_acquire_timeout() and
smp_cond_load_acquire_timeout(). Both of these handle the timeout
and amortize as needed, so use check_timeout() directly.

Also, when using spin-wait implementations, redefine
SMP_TIMEOUT_POLL_COUNT to be 16k, similar to the spin-count used
in RES_CHECK_TIMEOUT().

Cc: Kumar Kartikeya Dwivedi
Cc: Alexei Starovoitov
Signed-off-by: Ankur Arora
---
 kernel/bpf/rqspinlock.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 21be48108e962..934439bdc423d 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -238,20 +238,14 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 }
 
 /*
- * Do not amortize with spins when res_smp_cond_load_acquire is defined,
- * as the macro does internal amortization for us.
+ * Amortize timeout check for busy-wait loops.
  */
-#ifndef res_smp_cond_load_acquire
 #define RES_CHECK_TIMEOUT(ts, ret, mask)				\
 	({								\
 		if (!(ts).spin++)					\
 			(ret) = check_timeout((lock), (mask), &(ts));	\
 		(ret);							\
 	})
-#else
-#define RES_CHECK_TIMEOUT(ts, ret, mask)				\
-	({ (ret) = check_timeout((lock), (mask), &(ts)); })
-#endif
 
 /*
  * Initialize the 'spin' member.
@@ -265,6 +259,15 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
  */
 #define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
 
+/*
+ * Limit how often check_timeout() is invoked while spin-waiting by
+ * smp_cond_load_acquire_timeout() or atomic_cond_read_acquire_timeout().
+ */ +#ifndef CONFIG_ARM64 +#undef SMP_TIMEOUT_POLL_COUNT +#define SMP_TIMEOUT_POLL_COUNT (16*1024) +#endif + /* * Provide a test-and-set fallback for cases when queued spin lock support is * absent from the architecture. @@ -310,12 +313,6 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); */ static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); -#ifndef res_smp_cond_load_acquire -#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) -#endif - -#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) - /** * resilient_queued_spin_lock_slowpath - acquire the queued spinlock * @lock: Pointer to queued spinlock structure @@ -415,7 +412,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) */ if (val & _Q_LOCKED_MASK) { RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); - res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); + smp_cond_load_acquire_timeout(&lock->locked, !VAL, + (ret = check_timeout(lock, _Q_LOCKED_MASK, &ts))); } if (ret) { @@ -569,9 +567,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * us. */ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); - val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || - RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); - + val = atomic_cond_read_acquire_timeout(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + (ret = check_timeout(lock, _Q_LOCKED_PENDING_MASK, &ts))); waitq_timeout: if (ret) { /* From fe4d64a08863944303b6857b3ff7dbf06c149d98 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 16 Oct 2025 09:56:46 -0700 Subject: [PATCH 7/7] cpuidle/poll_state: poll via smp_cond_load_relaxed_timeout() The inner loop in poll_idle() polls over the thread_info flags, waiting to see if the thread has TIF_NEED_RESCHED set. The loop exits once the condition is met, or if the poll time limit has been exceeded. To minimize the number of instructions executed in each iteration, the time check is done only intermittently (once every POLL_IDLE_RELAX_COUNT iterations). In addition, each loop iteration executes cpu_relax() which on certain platforms provides a hint to the pipeline that the loop busy-waits, allowing the processor to reduce power consumption. This is close to what smp_cond_load_relaxed_timeout() provides. So, restructure the loop and fold the loop condition and the timeout check in smp_cond_load_relaxed_timeout(). Cc: "Rafael J. 
Wysocki" Cc: Daniel Lezcano Signed-off-by: Ankur Arora --- drivers/cpuidle/poll_state.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 9b6d90a726019..72d048c8ae7f5 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -8,36 +8,23 @@ #include #include -#define POLL_IDLE_RELAX_COUNT 200 - static int __cpuidle poll_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - u64 time_start; - - time_start = local_clock_noinstr(); + u64 time_end; + u32 flags = 0; dev->poll_time_limit = false; + time_end = local_clock_noinstr() + cpuidle_poll_time(drv, dev); + raw_local_irq_enable(); - if (!current_set_polling_and_test()) { - unsigned int loop_count = 0; - u64 limit; - - limit = cpuidle_poll_time(drv, dev); - - while (!need_resched()) { - cpu_relax(); - if (loop_count++ < POLL_IDLE_RELAX_COUNT) - continue; - - loop_count = 0; - if (local_clock_noinstr() - time_start > limit) { - dev->poll_time_limit = true; - break; - } - } - } + if (!current_set_polling_and_test()) + flags = smp_cond_load_relaxed_timeout(¤t_thread_info()->flags, + (VAL & _TIF_NEED_RESCHED), + (local_clock_noinstr() >= time_end)); + dev->poll_time_limit = !(flags & _TIF_NEED_RESCHED); + raw_local_irq_disable(); current_clr_polling();