Commit b2b00dd

rcu: React to callback overload by aggressively seeking quiescent states
In default configurations, RCU currently waits at least 100 milliseconds
before asking cond_resched() and/or resched_cpu() for help seeking
quiescent states to end a grace period. But 100 milliseconds can be one
good long time during an RCU callback flood, as can happen, for example,
when user processes repeatedly open and close files in a tight loop.
These 100-millisecond gaps between successive grace periods during a
callback flood can allow excessive numbers of callbacks to pile up,
unnecessarily increasing memory footprint. This commit therefore asks
cond_resched() and/or resched_cpu() for help as early as the first FQS
scan whenever at least one CPU has more than 20,000 callbacks queued, a
number that can be changed using the new rcutree.qovld kernel boot
parameter. An auxiliary qovld_calc variable is used to avoid acquiring
locks that have not yet been initialized. Early tests indicate that this
reduces the RCU-callback memory footprint during rcutorture floods by
50% to 4x, depending on configuration.

Reported-by: Joel Fernandes (Google) <[email protected]>
Reported-by: Tejun Heo <[email protected]>
[ paulmck: Fix bug located by Qian Cai. ]
Signed-off-by: Paul E. McKenney <[email protected]>
Tested-by: Dexuan Cui <[email protected]>
Tested-by: Qian Cai <[email protected]>
1 parent b5ea037 commit b2b00dd

File tree: 4 files changed, +86 −4 lines

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 9 additions & 0 deletions
@@ -3980,6 +3980,15 @@
 			Set threshold of queued RCU callbacks below which
 			batch limiting is re-enabled.
 
+	rcutree.qovld= [KNL]
+			Set threshold of queued RCU callbacks beyond which
+			RCU's force-quiescent-state scan will aggressively
+			enlist help from cond_resched() and sched IPIs to
+			help CPUs more quickly reach quiescent states.
+			Set to less than zero to make this be set based
+			on rcutree.qhimark at boot time and to zero to
+			disable more aggressive help enlistment.
+
 	rcutree.rcu_idle_gp_delay= [KNL]
 			Set wakeup interval for idle CPUs that have
 			RCU callbacks (RCU_FAST_NO_HZ=y).
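
For reference, the default threshold follows from the tree.c definitions below: qovld defaults to DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK = 2 * 10000 = 20,000 queued callbacks, matching the commit message. A different policy can be selected on the kernel command line; the 40000 in this sketch is purely an illustrative value, not a recommendation:

	rcutree.qovld=40000	# declare overload at 40,000 queued callbacks (illustrative)
	rcutree.qovld=-1	# derive the threshold from rcutree.qhimark at boot
	rcutree.qovld=0		# disable the aggressive help enlistment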

kernel/rcu/tree.c

Lines changed: 71 additions & 4 deletions
@@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
 static void sync_sched_exp_online_cleanup(int cpu);
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 
 /* rcuc/rcub kthread realtime priority */
 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;

@@ -410,10 +411,15 @@ static long blimit = DEFAULT_RCU_BLIMIT;
 static long qhimark = DEFAULT_RCU_QHIMARK;
 #define DEFAULT_RCU_QLOMARK 100   /* Once only this many pending, use blimit. */
 static long qlowmark = DEFAULT_RCU_QLOMARK;
+#define DEFAULT_RCU_QOVLD_MULT 2
+#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
+static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
+static long qovld_calc = -1;	  /* No pre-initialization lock acquisitions! */
 
 module_param(blimit, long, 0444);
 module_param(qhimark, long, 0444);
 module_param(qlowmark, long, 0444);
+module_param(qovld, long, 0444);
 
 static ulong jiffies_till_first_fqs = ULONG_MAX;
 static ulong jiffies_till_next_fqs = ULONG_MAX;

@@ -1072,7 +1078,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
 	if (!READ_ONCE(*rnhqp) &&
 	    (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
-	     time_after(jiffies, rcu_state.jiffies_resched))) {
+	     time_after(jiffies, rcu_state.jiffies_resched) ||
+	     rcu_state.cbovld)) {
 		WRITE_ONCE(*rnhqp, true);
 		/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
 		smp_store_release(ruqp, true);

@@ -1089,8 +1096,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * So hit them over the head with the resched_cpu() hammer!
 	 */
 	if (tick_nohz_full_cpu(rdp->cpu) &&
-	    time_after(jiffies,
-		       READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+	    (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
+	     rcu_state.cbovld)) {
 		WRITE_ONCE(*ruqp, true);
 		resched_cpu(rdp->cpu);
 		WRITE_ONCE(rdp->last_fqs_resched, jiffies);

@@ -1704,8 +1711,9 @@ static void rcu_gp_fqs_loop(void)
  */
 static void rcu_gp_cleanup(void)
 {
-	unsigned long gp_duration;
+	int cpu;
 	bool needgp = false;
+	unsigned long gp_duration;
 	unsigned long new_gp_seq;
 	bool offloaded;
 	struct rcu_data *rdp;

@@ -1751,6 +1759,12 @@ static void rcu_gp_cleanup(void)
 		needgp = __note_gp_changes(rnp, rdp) || needgp;
 		/* smp_mb() provided by prior unlock-lock pair. */
 		needgp = rcu_future_gp_cleanup(rnp) || needgp;
+		// Reset overload indication for CPUs no longer overloaded
+		if (rcu_is_leaf_node(rnp))
+			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
+				rdp = per_cpu_ptr(&rcu_data, cpu);
+				check_cb_ovld_locked(rdp, rnp);
+			}
 		sq = rcu_nocb_gp_get(rnp);
 		raw_spin_unlock_irq_rcu_node(rnp);
 		rcu_nocb_gp_cleanup(sq);

@@ -2299,10 +2313,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
+	rcu_state.cbovld = rcu_state.cbovldnext;
+	rcu_state.cbovldnext = false;
 	rcu_for_each_leaf_node(rnp) {
 		cond_resched_tasks_rcu_qs();
 		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
 			if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
 			    rcu_preempt_blocked_readers_cgp(rnp)) {

@@ -2583,6 +2600,48 @@ static void rcu_leak_callback(struct rcu_head *rhp)
 {
 }
 
+/*
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks.  The caller must hold the leaf rcu_node
+ * structure's ->lock.
+ */
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
+{
+	raw_lockdep_assert_held_rcu_node(rnp);
+	if (qovld_calc <= 0)
+		return; // Early boot and wildcard value set.
+	if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
+		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
+	else
+		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
+}
+
+/*
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks.  No locks need be held, but the
+ * caller must have disabled interrupts.
+ *
+ * Note that this function ignores the possibility that there are a lot
+ * of callbacks all of which have already seen the end of their respective
+ * grace periods.  This omission is due to the need for no-CBs CPUs to
+ * be holding ->nocb_lock to do this check, which is too heavy for a
+ * common-case operation.
+ */
+static void check_cb_ovld(struct rcu_data *rdp)
+{
+	struct rcu_node *const rnp = rdp->mynode;
+
+	if (qovld_calc <= 0 ||
+	    ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
+	     !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
+		return; // Early boot wildcard value or already set correctly.
+	raw_spin_lock_rcu_node(rnp);
+	check_cb_ovld_locked(rdp, rnp);
+	raw_spin_unlock_rcu_node(rnp);
+}
+
 /*
  * Helper function for call_rcu() and friends.  The cpu argument will
  * normally be -1, indicating "currently running CPU".  It may specify

@@ -2626,6 +2685,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		rcu_segcblist_init(&rdp->cblist);
 	}
 
+	check_cb_ovld(rdp);
 	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */

@@ -3814,6 +3874,13 @@ void __init rcu_init(void)
 	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_par_gp_wq);
 	srcu_init();
+
+	/* Fill in default value for rcutree.qovld boot parameter. */
+	/* -After- the rcu_node ->lock fields are initialized! */
+	if (qovld < 0)
+		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
+	else
+		qovld_calc = qovld;
 }
 
 #include "tree_stall.h"

kernel/rcu/tree.h

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,8 @@ struct rcu_node {
 					/*  Online CPUs for next expedited GP. */
 					/*  Any CPU that has ever been online will */
 					/*  have its bit set. */
+	unsigned long cbovldmask;
+					/* CPUs experiencing callback overload. */
 	unsigned long ffmask;		/* Fully functional CPUs. */
 	unsigned long grpmask;		/* Mask to apply to parent qsmask. */
 					/*  Only one bit will be set in this mask. */

@@ -321,6 +323,8 @@ struct rcu_state {
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
 	struct swait_queue_head expedited_wq;	/* Wait for check-ins. */
 	int ncpus_snap;				/* # CPUs seen last time. */
+	u8 cbovld;				/* Callback overload now? */
+	u8 cbovldnext;				/* ^        ^  next time? */
 
 	unsigned long jiffies_force_qs;	/* Time at which to invoke */
 					/*  force_quiescent_state(). */
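
A note on the cbovld/cbovldnext pair added above: force_qs_rnp() (see the tree.c hunk earlier) implements a two-phase handoff in which each force-quiescent-state scan acts on the overload state gathered during the previous scan while accumulating fresh per-leaf state for the next one. A minimal sketch of that pattern, with hypothetical names standing in for the rcu_state fields:

	#include <stdbool.h>

	/* Hypothetical stand-ins for rcu_state.cbovld and rcu_state.cbovldnext. */
	static bool ovld_now, ovld_next;

	/* One scan: mirrors the two lines added at the top of force_qs_rnp()
	 * and the per-leaf OR done under the rcu_node lock. */
	static void scan_leaves(const unsigned long *leaf_masks, int nleaves)
	{
		ovld_now = ovld_next;	/* act on what the previous scan observed */
		ovld_next = false;	/* start accumulating for the next scan */
		for (int i = 0; i < nleaves; i++)
			ovld_next |= !!leaf_masks[i]; /* overloaded CPU on this leaf? */
		/* The rest of the scan consults ovld_now (rcu_state.cbovld in the
		 * kernel) when deciding whether to enlist cond_resched() and
		 * resched_cpu() immediately. */
	}

	int main(void)
	{
		unsigned long leaves[2] = { 0x0, 0x8 }; /* second leaf has an overloaded CPU */

		scan_leaves(leaves, 2);	/* scan 1: ovld_now == false, ovld_next == true */
		scan_leaves(leaves, 2);	/* scan 2: ovld_now == true, flood now acted on */
		return 0;
	}

The one-scan lag is the price of avoiding extra synchronization: a flood observed during one scan triggers the aggressive quiescent-state hammering beginning with the next scan.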

kernel/rcu/tree_plugin.h

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
 	if (qlowmark != DEFAULT_RCU_QLOMARK)
 		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+	if (qovld != DEFAULT_RCU_QOVLD)
+		pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
 	if (jiffies_till_first_fqs != ULONG_MAX)
 		pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
 	if (jiffies_till_next_fqs != ULONG_MAX)
