Commit df1e849
rcu: Enable tick for nohz_full CPUs slow to provide expedited QS
An expedited grace period can be stalled by a nohz_full CPU looping in kernel context. This possibility is currently handled by some carefully crafted checks in rcu_read_unlock_special() that enlist help from ksoftirqd when permitted by the scheduler. However, it is exactly these checks that require that the scheduler avoid holding any of its rq or pi locks across rcu_read_unlock() without also having held them across the entire RCU read-side critical section.

It would therefore be very nice if expedited grace periods could handle nohz_full CPUs looping in kernel context without such checks. This commit therefore adds code to the expedited grace period's wait and cleanup paths that forces the scheduler-clock interrupt on for CPUs that fail to quickly supply a quiescent state. "Quickly" is currently a hard-coded single-jiffy delay.

Signed-off-by: Paul E. McKenney <[email protected]>
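The mechanism described above is easiest to see in condensed form. The following is a simplified sketch, not the verbatim patch: it combines the wait-side and cleanup-side changes from the hunks below, reuses their identifiers (cpu, rnp, and rdp are locals of the surrounding functions), and omits locking, tracing, and surrounding context.

/*
 * Simplified sketch of the new wait-side logic in
 * synchronize_rcu_expedited_wait(): give the expedited grace period one
 * jiffy, then force the tick on any CPU that still owes a quiescent state.
 */
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
        if (synchronize_rcu_expedited_wait_once(1))
                return;                 /* Grace period already ended. */
        rcu_for_each_leaf_node(rnp) {
                for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
                        rdp = per_cpu_ptr(&rcu_data, cpu);
                        if (rdp->rcu_forced_tick_exp)
                                continue;       /* Tick already forced on. */
                        rdp->rcu_forced_tick_exp = true;
                        tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
                }
        }
}

/*
 * On the cleanup side, rcu_report_exp_cpu_mult() undoes this for each CPU
 * whose quiescent state has been reported:
 *
 *      rdp->rcu_forced_tick_exp = false;
 *      tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
 */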
Parent: 28f0361

3 files changed, +50 −8 lines
include/linux/tick.h

Lines changed: 4 additions & 1 deletion
@@ -109,15 +109,18 @@ enum tick_dep_bits {
         TICK_DEP_BIT_PERF_EVENTS       = 1,
         TICK_DEP_BIT_SCHED             = 2,
         TICK_DEP_BIT_CLOCK_UNSTABLE    = 3,
-        TICK_DEP_BIT_RCU               = 4
+        TICK_DEP_BIT_RCU               = 4,
+        TICK_DEP_BIT_RCU_EXP           = 5
 };
+#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP
 
 #define TICK_DEP_MASK_NONE             0
 #define TICK_DEP_MASK_POSIX_TIMER      (1 << TICK_DEP_BIT_POSIX_TIMER)
 #define TICK_DEP_MASK_PERF_EVENTS      (1 << TICK_DEP_BIT_PERF_EVENTS)
 #define TICK_DEP_MASK_SCHED            (1 << TICK_DEP_BIT_SCHED)
 #define TICK_DEP_MASK_CLOCK_UNSTABLE   (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
 #define TICK_DEP_MASK_RCU              (1 << TICK_DEP_BIT_RCU)
+#define TICK_DEP_MASK_RCU_EXP          (1 << TICK_DEP_BIT_RCU_EXP)
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;

kernel/rcu/tree.h

Lines changed: 1 addition & 0 deletions
@@ -182,6 +182,7 @@ struct rcu_data {
         bool rcu_need_heavy_qs;        /* GP old, so heavy quiescent state! */
         bool rcu_urgent_qs;            /* GP old need light quiescent state. */
         bool rcu_forced_tick;          /* Forced tick to provide QS. */
+        bool rcu_forced_tick_exp;      /*   ... provide QS to expedited GP. */
 #ifdef CONFIG_RCU_FAST_NO_HZ
         bool all_lazy;                 /* All CPU's CBs lazy at idle start? */
         unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */

kernel/rcu/tree_exp.h

Lines changed: 45 additions & 7 deletions
@@ -230,14 +230,23 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
 static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
                                     unsigned long mask, bool wake)
 {
+        int cpu;
         unsigned long flags;
+        struct rcu_data *rdp;
 
         raw_spin_lock_irqsave_rcu_node(rnp, flags);
         if (!(rnp->expmask & mask)) {
                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                 return;
         }
         WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+        for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+                rdp = per_cpu_ptr(&rcu_data, cpu);
+                if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+                        continue;
+                rdp->rcu_forced_tick_exp = false;
+                tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+        }
         __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
 }
 
@@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void)
                 flush_work(&rnp->rew.rew_work);
 }
 
+/*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+        int t;
+        struct rcu_node *rnp_root = rcu_get_root();
+
+        t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+                                          sync_rcu_exp_done_unlocked(rnp_root),
+                                          tlimit);
+        // Workqueues should not be signaled.
+        if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+                return true;
+        WARN_ON(t < 0);  /* workqueues should not be signaled. */
+        return false;
+}
+
 /*
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void)
         unsigned long jiffies_start;
         unsigned long mask;
         int ndetected;
+        struct rcu_data *rdp;
         struct rcu_node *rnp;
         struct rcu_node *rnp_root = rcu_get_root();
-        int ret;
 
         trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
         jiffies_stall = rcu_jiffies_till_stall_check();
         jiffies_start = jiffies;
+        if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+                if (synchronize_rcu_expedited_wait_once(1))
+                        return;
+                rcu_for_each_leaf_node(rnp) {
+                        for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+                                rdp = per_cpu_ptr(&rcu_data, cpu);
+                                if (rdp->rcu_forced_tick_exp)
+                                        continue;
+                                rdp->rcu_forced_tick_exp = true;
+                                tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+                        }
+                }
+                WARN_ON_ONCE(1);
+        }
 
         for (;;) {
-                ret = swait_event_timeout_exclusive(
-                                rcu_state.expedited_wq,
-                                sync_rcu_exp_done_unlocked(rnp_root),
-                                jiffies_stall);
-                if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+                if (synchronize_rcu_expedited_wait_once(jiffies_stall))
                         return;
-                WARN_ON(ret < 0); /* workqueues should not be signaled. */
                 if (rcu_cpu_stall_suppress)
                         continue;
                 panic_on_rcu_stall();
