Commit e5f9201

joelagnel authored and chantra committed
rcu: Make call_rcu() lazy to save power
Implement timer-based RCU callback batching (also known as lazy callbacks). With this we save about 5-10% of power consumed due to RCU requests that happen when the system is lightly loaded or idle.

By default, all async callbacks (queued via call_rcu) are marked lazy. An alternate API, call_rcu_flush(), is provided for the few users, for example synchronize_rcu(), that need the old behavior.

The batch is flushed whenever a certain amount of time has passed, or the batch on a particular CPU grows too big. A future patch will also flush it under memory pressure.

To handle several corner cases automagically (such as rcu_barrier() and hotplug), we re-use bypass lists, which were originally introduced to address lock contention, to handle lazy CBs as well. The bypass list length has the lazy CB length included in it. A separate lazy CB length counter is also introduced to keep track of the number of lazy CBs.

[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ]

Suggested-by: Paul McKenney <[email protected]>
Acked-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
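As a usage sketch (struct foo and its helpers below are hypothetical and assume the usual <linux/rcupdate.h> and <linux/slab.h> includes; only call_rcu() and call_rcu_flush() come from this patch), an ordinary deferred free keeps using call_rcu() and becomes lazy automatically, while a caller that cannot tolerate the batching delay switches to call_rcu_flush():

/* Hypothetical object whose memory is reclaimed after a grace period. */
struct foo {
	int data;
	struct rcu_head rh;
};

static void foo_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

static void foo_release(struct foo *fp)
{
	/* Default path: lazy; the callback may be batched to save power. */
	call_rcu(&fp->rh, foo_free_rcu);
}

static void foo_release_now(struct foo *fp)
{
	/* Flushes the lazy batch so a grace period starts promptly. */
	call_rcu_flush(&fp->rh, foo_free_rcu);
}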
1 parent dbcf642 commit e5f9201

File tree: 8 files changed, +246 -82 lines changed

include/linux/rcupdate.h

Lines changed: 9 additions & 0 deletions
@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_LAZY
+void call_rcu_flush(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void call_rcu_flush(struct rcu_head *head, rcu_callback_t func)
+{
+	call_rcu(head, func);
+}
+#endif
+
 /* Internal to kernel */
 void rcu_init(void);
 extern int rcu_scheduler_active;

kernel/rcu/Kconfig

Lines changed: 8 additions & 0 deletions
@@ -314,4 +314,12 @@ config TASKS_TRACE_RCU_READ_MB
 	  Say N here if you hate read-side memory barriers.
 	  Take the default if you are unsure.
 
+config RCU_LAZY
+	bool "RCU callback lazy invocation functionality"
+	depends on RCU_NOCB_CPU
+	default n
+	help
+	  To save power, batch RCU callbacks and flush after delay, memory
+	  pressure, or callback list growing too big.
+
 endmenu # "RCU Subsystem"

kernel/rcu/rcu.h

Lines changed: 8 additions & 0 deletions
@@ -474,6 +474,14 @@ enum rcutorture_type {
 	INVALID_RCU_FLAVOR
 };
 
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
 #if defined(CONFIG_TREE_RCU)
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 			    unsigned long *gp_seq);
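The rcu_lazy_*_jiffies_till_flush() accessors are internal to the RCU subsystem (rcu.h is not a public header). As a hedged sketch, in-kernel test code built alongside RCU might temporarily shorten the flush timeout like this; the helper name and the 10-jiffy value are made up for illustration:

/* Hypothetical test helper: run fn() with a shorter lazy-flush timeout. */
static void run_with_short_lazy_flush(void (*fn)(void))
{
	unsigned long orig = rcu_lazy_get_jiffies_till_flush();

	rcu_lazy_set_jiffies_till_flush(10);	/* flush lazy CBs after ~10 jiffies */
	fn();
	rcu_lazy_set_jiffies_till_flush(orig);	/* restore the default timeout */
}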

kernel/rcu/tiny.c

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 
 void rcu_barrier(void)
 {
-	wait_rcu_gp(call_rcu);
+	wait_rcu_gp(call_rcu_flush);
 }
 EXPORT_SYMBOL(rcu_barrier);
 
kernel/rcu/tree.c

Lines changed: 83 additions & 46 deletions
@@ -2748,47 +2748,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
 	raw_spin_unlock_rcu_node(rnp);
 }
 
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested.  In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections.  This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section.  On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu().  It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section.  Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
@@ -2829,7 +2790,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 
 	check_cb_ovld(rdp);
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2851,8 +2812,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_flush() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_flush(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_flush);
+#endif
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_flush().
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section.  On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu().  It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section.  Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, true);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (5 * HZ)
@@ -3527,7 +3564,7 @@ void synchronize_rcu(void)
 		if (rcu_gp_is_expedited())
 			synchronize_rcu_expedited();
 		else
-			wait_rcu_gp(call_rcu);
+			wait_rcu_gp(call_rcu_flush);
 		return;
 	}
 
@@ -3930,7 +3967,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	 * if it's fully lazy.
	 */
 	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
 	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
@@ -4354,7 +4391,7 @@ void rcutree_migrate_callbacks(int cpu)
 	my_rdp = this_cpu_ptr(&rcu_data);
 	my_rnp = my_rdp->mynode;
 	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
 	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
 	/* Leverage recent GPs and set GP for new callbacks. */
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
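The synchronize_rcu() change above passes call_rcu_flush() to wait_rcu_gp() so that a task sleeping on a grace period is never held up by the lazy batching timer, and the rcu_barrier() and hotplug-migration paths flush the bypass list non-lazily for the same reason. A simplified sketch of that wait pattern, with hypothetical names (the real logic lives in wait_rcu_gp()), assuming the kernel's completion primitives from <linux/completion.h>:

struct my_rcu_waiter {
	struct rcu_head head;
	struct completion done;
};

static void my_wakeme_after_rcu(struct rcu_head *rhp)
{
	struct my_rcu_waiter *w = container_of(rhp, struct my_rcu_waiter, head);

	complete(&w->done);
}

static void my_wait_for_grace_period(void)
{
	struct my_rcu_waiter w;

	init_completion(&w.done);
	/* Non-lazy: start a grace period promptly instead of batching. */
	call_rcu_flush(&w.head, my_wakeme_after_rcu);
	wait_for_completion(&w.done);
}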

kernel/rcu/tree.h

Lines changed: 7 additions & 4 deletions
@@ -263,14 +263,16 @@ struct rcu_data {
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 
+	long lazy_len;			/* Length of buffered lazy callbacks. */
 	int cpu;
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
 #define RCU_NOCB_WAKE_NOT	0
 #define RCU_NOCB_WAKE_BYPASS	1
-#define RCU_NOCB_WAKE		2
-#define RCU_NOCB_WAKE_FORCE	3
+#define RCU_NOCB_WAKE_LAZY	2
+#define RCU_NOCB_WAKE		3
+#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
					/* For jiffies_till_first_fqs and */
@@ -441,9 +443,10 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j);
+				  unsigned long j, bool lazy);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags);
+				bool *was_alldone, unsigned long flags,
+				bool lazy);
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 				 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);

kernel/rcu/tree_exp.h

Lines changed: 1 addition & 1 deletion
@@ -937,7 +937,7 @@ void synchronize_rcu_expedited(void)
 
 	/* If expedited grace periods are prohibited, fall back to normal. */
 	if (rcu_gp_is_normal()) {
-		wait_rcu_gp(call_rcu);
+		wait_rcu_gp(call_rcu_flush);
 		return;
 	}
 