|
| 1 | +/* SPDX-License-Identifier: GPL-2.0+ */ |
| 2 | +/* |
| 3 | + * Task-based RCU implementations. |
| 4 | + * |
| 5 | + * Copyright (C) 2020 Paul E. McKenney |
| 6 | + */ |
| 7 | + |
| 8 | +#ifdef CONFIG_TASKS_RCU |
| 9 | + |
| 10 | +/* |
| 11 | + * Simple variant of RCU whose quiescent states are voluntary context |
| 12 | + * switch, cond_resched_rcu_qs(), user-space execution, and idle. |
| 13 | + * As such, grace periods can take one good long time. There are no |
| 14 | + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() |
| 15 | + * because this implementation is intended to get the system into a safe |
| 16 | + * state for some of the manipulations involved in tracing and the like. |
| 17 | + * Finally, this implementation does not support high call_rcu_tasks() |
| 18 | + * rates from multiple CPUs. If this is required, per-CPU callback lists |
| 19 | + * will be needed. |
| 20 | + */ |
| 21 | + |
/* Global list of callbacks and associated lock. */
static struct rcu_head *rcu_tasks_cbs_head;
/* Tail pointer for O(1) enqueue: points at ->next of the last callback, or at the head pointer when the list is empty. */
static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
/* Wait queue on which rcu_tasks_kthread() sleeps until callbacks arrive. */
static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
/* Protects rcu_tasks_cbs_head and rcu_tasks_cbs_tail. */
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);

/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);

/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);

/* Published (via WRITE_ONCE) once the grace-period kthread is running; NULL means callbacks are queued but no wakeup is attempted yet. */
static struct task_struct *rcu_tasks_kthread_ptr;
| 37 | + |
/**
 * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
 * @rhp: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all currently executing RCU
 * read-side critical sections have completed. call_rcu_tasks() assumes
 * that the read-side critical sections end at a voluntary context
 * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
 * or transition to usermode execution. As such, there are no read-side
 * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
 * this primitive is intended to determine that all tasks have passed
 * through a safe state, not so much for data-structure synchronization.
 *
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
 */
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
	unsigned long flags;
	bool needwake;

	rhp->next = NULL;
	rhp->func = func;
	raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
	/* An empty list means the kthread may be asleep and need a wakeup. */
	needwake = !rcu_tasks_cbs_head;
	WRITE_ONCE(*rcu_tasks_cbs_tail, rhp);
	rcu_tasks_cbs_tail = &rhp->next;
	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
	/* We can't create the thread unless interrupts are enabled. */
	if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
		wake_up(&rcu_tasks_cbs_wq);
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
| 73 | + |
/**
 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
 *
 * Control will return to the caller some time after a full rcu-tasks
 * grace period has elapsed, in other words after all currently
 * executing rcu-tasks read-side critical sections have elapsed. These
 * read-side critical sections are delimited by calls to schedule(),
 * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
 *
 * This is a very specialized primitive, intended only for a few uses in
 * tracing and other situations requiring manipulation of function
 * preambles and profiling hooks. The synchronize_rcu_tasks() function
 * is not (yet) intended for heavy use from multiple CPUs.
 *
 * Note that this guarantee implies further memory-ordering guarantees.
 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
 * each CPU is guaranteed to have executed a full memory barrier since the
 * end of its last RCU-tasks read-side critical section whose beginning
 * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
 * having an RCU-tasks read-side critical section that extends beyond
 * the return from synchronize_rcu_tasks() is guaranteed to have executed
 * a full memory barrier after the beginning of synchronize_rcu_tasks()
 * and before the beginning of that RCU-tasks read-side critical section.
 * Note that these guarantees include CPUs that are offline, idle, or
 * executing in user mode, as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
 * (but again only if the system has more than one CPU).
 */
void synchronize_rcu_tasks(void)
{
	/* Complain if the scheduler has not started. */
	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
			 "synchronize_rcu_tasks called too soon");

	/* Wait for the grace period: queue a callback and block until it runs. */
	wait_rcu_gp(call_rcu_tasks);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
| 117 | + |
/**
 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
 *
 * Although the current implementation is guaranteed to wait, it is not
 * obligated to, for example, if there are no pending callbacks.
 */
void rcu_barrier_tasks(void)
{
	/*
	 * There is only one callback queue, so this is easy. ;-)
	 * A full grace period drains all previously queued callbacks,
	 * because rcu_tasks_kthread() invokes the whole snapshotted list
	 * before starting the next grace period.
	 */
	synchronize_rcu_tasks();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
| 130 | + |
/*
 * See if tasks are still holding out, complain if so.
 *
 * Called with @t on the holdout list, holding the reference taken by
 * rcu_tasks_kthread() when it built that list. If @t has since reached
 * a quiescent state, it is removed from the list and the reference is
 * dropped. Otherwise, if @needreport, a stall report is emitted, with
 * *@firstreport gating the one-time header line.
 */
static void check_holdout_task(struct task_struct *t,
			       bool needreport, bool *firstreport)
{
	int cpu;

	/*
	 * Exonerate @t if it: was already marked as no longer a holdout,
	 * has done a voluntary context switch since being marked (->nvcsw
	 * changed), has left the runqueue, or -- on NO_HZ_FULL -- is a
	 * non-idle task with ->rcu_tasks_idle_cpu >= 0 (presumably set on
	 * nohz_full idle/usermode entry -- verify against the setters).
	 */
	if (!READ_ONCE(t->rcu_tasks_holdout) ||
	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
	    !READ_ONCE(t->on_rq) ||
	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
		WRITE_ONCE(t->rcu_tasks_holdout, false);
		list_del_init(&t->rcu_tasks_holdout_list);
		put_task_struct(t);
		return;
	}
	/* Still a holdout: ask for it to be driven to a quiescent state. */
	rcu_request_urgent_qs_task(t);
	if (!needreport)
		return;
	if (*firstreport) {
		pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
		*firstreport = false;
	}
	cpu = task_cpu(t);
	pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
		 t, ".I"[is_idle_task(t)],
		 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
		 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
		 t->rcu_tasks_idle_cpu, cpu);
	sched_show_task(t);
}
| 162 | + |
/*
 * RCU-tasks kthread that detects grace periods and invokes callbacks.
 *
 * @arg is unused. Never returns; loops forever snapshotting the global
 * callback list, driving one RCU-tasks grace period per non-empty
 * snapshot, then invoking the snapshotted callbacks.
 */
static int __noreturn rcu_tasks_kthread(void *arg)
{
	unsigned long flags;
	struct task_struct *g, *t;
	unsigned long lastreport;	/* jiffies of last stall report. */
	struct rcu_head *list;		/* Snapshotted callbacks for this GP. */
	struct rcu_head *next;
	LIST_HEAD(rcu_tasks_holdouts);
	int fract;			/* Wait is HZ/fract jiffies. */

	/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
	housekeeping_affine(current, HK_FLAG_RCU);

	/*
	 * Each pass through the following loop makes one check for
	 * newly arrived callbacks, and, if there are some, waits for
	 * one RCU-tasks grace period and then invokes the callbacks.
	 * This loop is terminated by the system going down. ;-)
	 */
	for (;;) {

		/* Pick up any new callbacks. */
		raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
		list = rcu_tasks_cbs_head;
		rcu_tasks_cbs_head = NULL;
		rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
		raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);

		/* If there were none, wait a bit and start over. */
		if (!list) {
			wait_event_interruptible(rcu_tasks_cbs_wq,
						 READ_ONCE(rcu_tasks_cbs_head));
			if (!rcu_tasks_cbs_head) {
				/* Spurious wakeup: no signals are expected. */
				WARN_ON(signal_pending(current));
				schedule_timeout_interruptible(HZ/10);
			}
			continue;
		}

		/*
		 * Wait for all pre-existing t->on_rq and t->nvcsw
		 * transitions to complete. Invoking synchronize_rcu()
		 * suffices because all these transitions occur with
		 * interrupts disabled. Without this synchronize_rcu(),
		 * a read-side critical section that started before the
		 * grace period might be incorrectly seen as having started
		 * after the grace period.
		 *
		 * This synchronize_rcu() also dispenses with the
		 * need for a memory barrier on the first store to
		 * ->rcu_tasks_holdout, as it forces the store to happen
		 * after the beginning of the grace period.
		 */
		synchronize_rcu();

		/*
		 * There were callbacks, so we need to wait for an
		 * RCU-tasks grace period. Start off by scanning
		 * the task list for tasks that are not already
		 * voluntarily blocked. Mark these tasks and make
		 * a list of them in rcu_tasks_holdouts.
		 */
		rcu_read_lock();
		for_each_process_thread(g, t) {
			/* Skip ourselves, blocked tasks, and idle tasks. */
			if (t != current && READ_ONCE(t->on_rq) &&
			    !is_idle_task(t)) {
				get_task_struct(t);
				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
				WRITE_ONCE(t->rcu_tasks_holdout, true);
				list_add(&t->rcu_tasks_holdout_list,
					 &rcu_tasks_holdouts);
			}
		}
		rcu_read_unlock();

		/*
		 * Wait for tasks that are in the process of exiting.
		 * This does only part of the job, ensuring that all
		 * tasks that were previously exiting reach the point
		 * where they have disabled preemption, allowing the
		 * later synchronize_rcu() to finish the job.
		 */
		synchronize_srcu(&tasks_rcu_exit_srcu);

		/*
		 * Each pass through the following loop scans the list
		 * of holdout tasks, removing any that are no longer
		 * holdouts. When the list is empty, we are done.
		 */
		lastreport = jiffies;

		/* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/
		fract = 10;

		for (;;) {
			bool firstreport;
			bool needreport;
			int rtst;
			struct task_struct *t1;

			if (list_empty(&rcu_tasks_holdouts))
				break;

			/* Slowly back off waiting for holdouts */
			schedule_timeout_interruptible(HZ/fract);

			if (fract > 1)
				fract--;

			rtst = READ_ONCE(rcu_task_stall_timeout);
			/* Report stalls only if enabled (rtst > 0) and due. */
			needreport = rtst > 0 &&
				     time_after(jiffies, lastreport + rtst);
			if (needreport)
				lastreport = jiffies;
			firstreport = true;
			WARN_ON(signal_pending(current));
			list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
						rcu_tasks_holdout_list) {
				check_holdout_task(t, needreport, &firstreport);
				cond_resched();
			}
		}

		/*
		 * Because ->on_rq and ->nvcsw are not guaranteed
		 * to have a full memory barriers prior to them in the
		 * schedule() path, memory reordering on other CPUs could
		 * cause their RCU-tasks read-side critical sections to
		 * extend past the end of the grace period. However,
		 * because these ->nvcsw updates are carried out with
		 * interrupts disabled, we can use synchronize_rcu()
		 * to force the needed ordering on all such CPUs.
		 *
		 * This synchronize_rcu() also confines all
		 * ->rcu_tasks_holdout accesses to be within the grace
		 * period, avoiding the need for memory barriers for
		 * ->rcu_tasks_holdout accesses.
		 *
		 * In addition, this synchronize_rcu() waits for exiting
		 * tasks to complete their final preempt_disable() region
		 * of execution, cleaning up after the synchronize_srcu()
		 * above.
		 */
		synchronize_rcu();

		/* Invoke the callbacks. */
		while (list) {
			next = list->next;
			local_bh_disable();
			list->func(list);
			local_bh_enable();
			list = next;
			cond_resched();
		}
		/* Paranoid sleep to keep this from entering a tight loop */
		schedule_timeout_uninterruptible(HZ/10);
	}
}
| 322 | + |
| 323 | +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ |
| 324 | +static int __init rcu_spawn_tasks_kthread(void) |
| 325 | +{ |
| 326 | + struct task_struct *t; |
| 327 | + |
| 328 | + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); |
| 329 | + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) |
| 330 | + return 0; |
| 331 | + smp_mb(); /* Ensure others see full kthread. */ |
| 332 | + WRITE_ONCE(rcu_tasks_kthread_ptr, t); |
| 333 | + return 0; |
| 334 | +} |
| 335 | +core_initcall(rcu_spawn_tasks_kthread); |
| 336 | + |
/*
 * Do the srcu_read_lock() for the above synchronize_srcu().
 *
 * Called by an exiting task; the matching exit_tasks_rcu_finish() drops
 * the SRCU reader. Index is stashed in current->rcu_tasks_idx.
 */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
	/* __srcu_read_lock() is the raw form; preemption must be off. */
	preempt_disable();
	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
	preempt_enable();
}
| 344 | + |
/*
 * Do the srcu_read_unlock() for the above synchronize_srcu().
 *
 * Pairs with exit_tasks_rcu_start(); uses the SRCU index stashed in
 * current->rcu_tasks_idx.
 */
void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
{
	/* __srcu_read_unlock() is the raw form; preemption must be off. */
	preempt_disable();
	__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
	preempt_enable();
}
| 352 | + |
| 353 | +#endif /* #ifdef CONFIG_TASKS_RCU */ |
| 354 | + |
| 355 | +#ifndef CONFIG_TINY_RCU |
| 356 | + |
| 357 | +/* |
| 358 | + * Print any non-default Tasks RCU settings. |
| 359 | + */ |
| 360 | +static void __init rcu_tasks_bootup_oddness(void) |
| 361 | +{ |
| 362 | +#ifdef CONFIG_TASKS_RCU |
| 363 | + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) |
| 364 | + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); |
| 365 | + else |
| 366 | + pr_info("\tTasks RCU enabled.\n"); |
| 367 | +#endif /* #ifdef CONFIG_TASKS_RCU */ |
| 368 | +} |
| 369 | + |
| 370 | +#endif /* #ifndef CONFIG_TINY_RCU */ |
0 commit comments