Skip to content

Commit a148866

Browse files
Peter Zijlstra authored and Ingo Molnar committed
sched: Replace rq::wake_list
The recent commit: 90b5363 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in ttwu_queue_remote() got this wrong. While on first reading ttwu_queue_remote() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. The actual race is vs the sched_ttwu_pending() call in the idle loop; that can run the wakeup-list without consuming the CSD. Instead of trying to chain the lists, merge them. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20200526161908.129371594@infradead.org
1 parent 126c209 commit a148866

File tree

6 files changed

+49
-34
lines changed

6 files changed

+49
-34
lines changed

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,7 @@ struct task_struct {
654654

655655
#ifdef CONFIG_SMP
656656
struct llist_node wake_entry;
657+
unsigned int wake_entry_type;
657658
int on_cpu;
658659
#ifdef CONFIG_THREAD_INFO_IN_TASK
659660
/* Current CPU: */

include/linux/smp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ enum {
2525
CSD_TYPE_ASYNC = 0x00,
2626
CSD_TYPE_SYNC = 0x10,
2727
CSD_TYPE_IRQ_WORK = 0x20,
28+
CSD_TYPE_TTWU = 0x30,
2829
CSD_FLAG_TYPE_MASK = 0xF0,
2930
};
3031

kernel/sched/core.c

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,7 +1538,7 @@ static int migration_cpu_stop(void *data)
15381538
* __migrate_task() such that we will not miss enforcing cpus_ptr
15391539
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
15401540
*/
1541-
sched_ttwu_pending();
1541+
flush_smp_call_function_from_idle();
15421542

15431543
raw_spin_lock(&p->pi_lock);
15441544
rq_lock(rq, &rf);
@@ -2272,14 +2272,13 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
22722272
}
22732273

22742274
#ifdef CONFIG_SMP
2275-
void sched_ttwu_pending(void)
2275+
void sched_ttwu_pending(void *arg)
22762276
{
2277+
struct llist_node *llist = arg;
22772278
struct rq *rq = this_rq();
2278-
struct llist_node *llist;
22792279
struct task_struct *p, *t;
22802280
struct rq_flags rf;
22812281

2282-
llist = llist_del_all(&rq->wake_list);
22832282
if (!llist)
22842283
return;
22852284

@@ -2299,11 +2298,6 @@ void sched_ttwu_pending(void)
22992298
rq_unlock_irqrestore(rq, &rf);
23002299
}
23012300

2302-
static void wake_csd_func(void *info)
2303-
{
2304-
sched_ttwu_pending();
2305-
}
2306-
23072301
void send_call_function_single_ipi(int cpu)
23082302
{
23092303
struct rq *rq = cpu_rq(cpu);
@@ -2327,12 +2321,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
23272321
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
23282322

23292323
WRITE_ONCE(rq->ttwu_pending, 1);
2330-
if (llist_add(&p->wake_entry, &rq->wake_list)) {
2331-
if (!set_nr_if_polling(rq->idle))
2332-
smp_call_function_single_async(cpu, &rq->wake_csd);
2333-
else
2334-
trace_sched_wake_idle_without_ipi(cpu);
2335-
}
2324+
__smp_call_single_queue(cpu, &p->wake_entry);
23362325
}
23372326

23382327
void wake_up_if_idle(int cpu)
@@ -2772,6 +2761,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
27722761
p->capture_control = NULL;
27732762
#endif
27742763
init_numa_balancing(clone_flags, p);
2764+
#ifdef CONFIG_SMP
2765+
p->wake_entry_type = CSD_TYPE_TTWU;
2766+
#endif
27752767
}
27762768

27772769
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -6564,7 +6556,6 @@ int sched_cpu_dying(unsigned int cpu)
65646556
struct rq_flags rf;
65656557

65666558
/* Handle pending wakeups and then migrate everything off */
6567-
sched_ttwu_pending();
65686559
sched_tick_stop(cpu);
65696560

65706561
rq_lock_irqsave(rq, &rf);
@@ -6763,8 +6754,6 @@ void __init sched_init(void)
67636754
rq->avg_idle = 2*sysctl_sched_migration_cost;
67646755
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
67656756

6766-
rq_csd_init(rq, &rq->wake_csd, wake_csd_func);
6767-
67686757
INIT_LIST_HEAD(&rq->cfs_tasks);
67696758

67706759
rq_attach_root(rq, &def_root_domain);

kernel/sched/idle.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,6 @@ static void do_idle(void)
294294
* critical section.
295295
*/
296296
flush_smp_call_function_from_idle();
297-
sched_ttwu_pending();
298297
schedule_idle();
299298

300299
if (unlikely(klp_patch_pending(current)))

kernel/sched/sched.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,11 +1023,6 @@ struct rq {
10231023
unsigned int ttwu_local;
10241024
#endif
10251025

1026-
#ifdef CONFIG_SMP
1027-
call_single_data_t wake_csd;
1028-
struct llist_head wake_list;
1029-
#endif
1030-
10311026
#ifdef CONFIG_CPU_IDLE
10321027
/* Must be inspected within a rcu lock section */
10331028
struct cpuidle_state *idle_state;
@@ -1371,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
13711366
rq->balance_callback = head;
13721367
}
13731368

1374-
extern void sched_ttwu_pending(void);
1375-
13761369
#define rcu_dereference_check_sched_domain(p) \
13771370
rcu_dereference_check((p), \
13781371
lockdep_is_held(&sched_domains_mutex))
@@ -1512,7 +1505,6 @@ extern void flush_smp_call_function_from_idle(void);
15121505

15131506
#else /* !CONFIG_SMP: */
15141507
static inline void flush_smp_call_function_from_idle(void) { }
1515-
static inline void sched_ttwu_pending(void) { }
15161508
#endif
15171509

15181510
#include "stats.h"

kernel/smp.c

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ void generic_smp_call_function_single_interrupt(void)
196196
flush_smp_call_function_queue(true);
197197
}
198198

199+
extern void sched_ttwu_pending(void *);
199200
extern void irq_work_single(void *);
200201

201202
/**
@@ -244,6 +245,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
244245
csd->func);
245246
break;
246247

248+
case CSD_TYPE_TTWU:
249+
pr_warn("IPI task-wakeup sent to offline CPU\n");
250+
break;
251+
247252
default:
248253
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
249254
CSD_TYPE(csd));
@@ -275,22 +280,43 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
275280
}
276281
}
277282

283+
if (!entry)
284+
return;
285+
278286
/*
279287
* Second; run all !SYNC callbacks.
280288
*/
289+
prev = NULL;
281290
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
282291
int type = CSD_TYPE(csd);
283292

284-
if (type == CSD_TYPE_ASYNC) {
285-
smp_call_func_t func = csd->func;
286-
void *info = csd->info;
293+
if (type != CSD_TYPE_TTWU) {
294+
if (prev) {
295+
prev->next = &csd_next->llist;
296+
} else {
297+
entry = &csd_next->llist;
298+
}
287299

288-
csd_unlock(csd);
289-
func(info);
290-
} else if (type == CSD_TYPE_IRQ_WORK) {
291-
irq_work_single(csd);
300+
if (type == CSD_TYPE_ASYNC) {
301+
smp_call_func_t func = csd->func;
302+
void *info = csd->info;
303+
304+
csd_unlock(csd);
305+
func(info);
306+
} else if (type == CSD_TYPE_IRQ_WORK) {
307+
irq_work_single(csd);
308+
}
309+
310+
} else {
311+
prev = &csd->llist;
292312
}
293313
}
314+
315+
/*
316+
* Third; only CSD_TYPE_TTWU is left, issue those.
317+
*/
318+
if (entry)
319+
sched_ttwu_pending(entry);
294320
}
295321

296322
void flush_smp_call_function_from_idle(void)
@@ -659,6 +685,13 @@ void __init smp_init(void)
659685
BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
660686
offsetof(struct __call_single_data, flags));
661687

688+
/*
689+
* Assert the CSD_TYPE_TTWU layout is similar enough
690+
* for task_struct to be on the @call_single_queue.
691+
*/
692+
BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
693+
offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
694+
662695
idle_threads_init();
663696
cpuhp_threads_init();
664697

0 commit comments

Comments
 (0)