Skip to content

Commit d479c5a

Browse files
committed
Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The changes in this cycle are: - Optimize the task wakeup CPU selection logic, to improve scalability and reduce wakeup latency spikes - PELT enhancements - CFS bandwidth handling fixes - Optimize the wakeup path by removing rq->wake_list and replacing it with ->ttwu_pending - Optimize IPI cross-calls by making flush_smp_call_function_queue() process sync callbacks first. - Misc fixes and enhancements" * tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too sched/headers: Split out open-coded prototypes into kernel/sched/smp.h sched: Replace rq::wake_list sched: Add rq::ttwu_pending irq_work, smp: Allow irq_work on call_single_queue smp: Optimize send_call_function_single_ipi() smp: Move irq_work_run() out of flush_smp_call_function_queue() smp: Optimize flush_smp_call_function_queue() sched: Fix smp_call_function_single_async() usage for ILB sched/core: Offload wakee task activation if it the wakee is descheduling sched/core: Optimize ttwu() spinning on p->on_cpu sched: Defend cfs and rt bandwidth quota against overflow sched/cpuacct: Fix charge cpuacct.usage_sys sched/fair: Replace zero-length array with flexible-array sched/pelt: Sync util/runnable_sum with PELT window when propagating sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr() sched/fair: Optimize enqueue_task_fair() sched: Make scheduler_ipi inline sched: Clean up scheduler_ipi() sched/core: Simplify sched_init() ...
2 parents f6aee50 + 25de110 commit d479c5a

File tree

21 files changed

+603
-408
lines changed

21 files changed

+603
-408
lines changed

arch/powerpc/platforms/powernv/smp.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
167167
/* Standard hot unplug procedure */
168168

169169
idle_task_exit();
170-
current->active_mm = NULL; /* for sanity */
171170
cpu = smp_processor_id();
172171
DBG("CPU%d offline\n", cpu);
173172
generic_set_cpu_dead(cpu);

include/linux/irq_work.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
1414
*/
1515

16+
/* flags share CSD_FLAG_ space */
17+
1618
#define IRQ_WORK_PENDING BIT(0)
1719
#define IRQ_WORK_BUSY BIT(1)
1820

@@ -23,9 +25,12 @@
2325

2426
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
2527

28+
/*
29+
* structure shares layout with single_call_data_t.
30+
*/
2631
struct irq_work {
27-
atomic_t flags;
2832
struct llist_node llnode;
33+
atomic_t flags;
2934
void (*func)(struct irq_work *);
3035
};
3136

@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
5358

5459
void irq_work_run(void);
5560
bool irq_work_needs_cpu(void);
61+
void irq_work_single(void *arg);
5662
#else
5763
static inline bool irq_work_needs_cpu(void) { return false; }
5864
static inline void irq_work_run(void) { }
65+
static inline void irq_work_single(void *arg) { }
5966
#endif
6067

6168
#endif /* _LINUX_IRQ_WORK_H */

include/linux/sched.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,7 @@ struct task_struct {
654654

655655
#ifdef CONFIG_SMP
656656
struct llist_node wake_entry;
657+
unsigned int wake_entry_type;
657658
int on_cpu;
658659
#ifdef CONFIG_THREAD_INFO_IN_TASK
659660
/* Current CPU: */
@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
17301731
})
17311732

17321733
#ifdef CONFIG_SMP
1733-
void scheduler_ipi(void);
1734+
static __always_inline void scheduler_ipi(void)
1735+
{
1736+
/*
1737+
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1738+
* TIF_NEED_RESCHED remotely (for the first time) will also send
1739+
* this IPI.
1740+
*/
1741+
preempt_fold_need_resched();
1742+
}
17341743
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
17351744
#else
17361745
static inline void scheduler_ipi(void) { }

include/linux/sched/mm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
4949
__mmdrop(mm);
5050
}
5151

52+
void mmdrop(struct mm_struct *mm);
53+
5254
/*
5355
* This has to be called after a get_task_mm()/mmget_not_zero()
5456
* followed by taking the mmap_sem for writing before modifying the

include/linux/sched/topology.h

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,20 @@
1111
*/
1212
#ifdef CONFIG_SMP
1313

14-
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
15-
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
16-
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
17-
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
18-
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
19-
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
20-
#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
21-
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
22-
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
23-
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
24-
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
25-
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
26-
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
27-
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
28-
#define SD_NUMA 0x4000 /* cross-node balancing */
14+
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
15+
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
16+
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
17+
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
18+
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
19+
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
20+
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
21+
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
22+
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
23+
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
24+
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
25+
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
26+
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
27+
#define SD_NUMA 0x2000 /* cross-node balancing */
2928

3029
#ifdef CONFIG_SCHED_SMT
3130
static inline int cpu_smt_flags(void)

include/linux/smp.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,39 @@
1616

1717
typedef void (*smp_call_func_t)(void *info);
1818
typedef bool (*smp_cond_func_t)(int cpu, void *info);
19+
20+
enum {
21+
CSD_FLAG_LOCK = 0x01,
22+
23+
/* IRQ_WORK_flags */
24+
25+
CSD_TYPE_ASYNC = 0x00,
26+
CSD_TYPE_SYNC = 0x10,
27+
CSD_TYPE_IRQ_WORK = 0x20,
28+
CSD_TYPE_TTWU = 0x30,
29+
CSD_FLAG_TYPE_MASK = 0xF0,
30+
};
31+
32+
/*
33+
* structure shares (partial) layout with struct irq_work
34+
*/
1935
struct __call_single_data {
2036
struct llist_node llist;
37+
unsigned int flags;
2138
smp_call_func_t func;
2239
void *info;
23-
unsigned int flags;
2440
};
2541

2642
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
2743
typedef struct __call_single_data call_single_data_t
2844
__aligned(sizeof(struct __call_single_data));
2945

46+
/*
47+
* Enqueue a llist_node on the call_single_queue; be very careful, read
48+
* flush_smp_call_function_queue() in detail.
49+
*/
50+
extern void __smp_call_single_queue(int cpu, struct llist_node *node);
51+
3052
/* total number of cpus in this system (may exceed NR_CPUS) */
3153
extern unsigned int total_cpus;
3254

include/linux/swait.h

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,10 @@
99
#include <asm/current.h>
1010

1111
/*
12-
* BROKEN wait-queues.
13-
*
14-
* These "simple" wait-queues are broken garbage, and should never be
15-
* used. The comments below claim that they are "similar" to regular
16-
* wait-queues, but the semantics are actually completely different, and
17-
* every single user we have ever had has been buggy (or pointless).
18-
*
19-
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
20-
* "wake_up()" does, and has led to problems. In other cases, it has
21-
* been fine, because there's only ever one waiter (kvm), but in that
22-
* case gthe whole "simple" wait-queue is just pointless to begin with,
23-
* since there is no "queue". Use "wake_up_process()" with a direct
24-
* pointer instead.
25-
*
26-
* While these are very similar to regular wait queues (wait.h) the most
27-
* important difference is that the simple waitqueue allows for deterministic
28-
* behaviour -- IOW it has strictly bounded IRQ and lock hold times.
12+
* Simple waitqueues are semantically very different to regular wait queues
13+
* (wait.h). The most important difference is that the simple waitqueue allows
14+
* for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15+
* times.
2916
*
3017
* Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
3118
* from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
@@ -39,7 +26,7 @@
3926
* sleeper state.
4027
*
4128
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
42-
* exclusive.
29+
* exclusive. As such swake_up_one will only ever awake _one_ waiter.
4330
*
4431
* - custom wake callback functions; because you cannot give any guarantees
4532
* about random code. This also allows swait to be used in RT, such that

kernel/cpu.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*
44
* This code is licenced under the GPL.
55
*/
6+
#include <linux/sched/mm.h>
67
#include <linux/proc_fs.h>
78
#include <linux/smp.h>
89
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
564565
return bringup_wait_for_ap(cpu);
565566
}
566567

568+
static int finish_cpu(unsigned int cpu)
569+
{
570+
struct task_struct *idle = idle_thread_get(cpu);
571+
struct mm_struct *mm = idle->active_mm;
572+
573+
/*
574+
* idle_task_exit() will have switched to &init_mm, now
575+
* clean up any remaining active_mm state.
576+
*/
577+
if (mm != &init_mm)
578+
idle->active_mm = &init_mm;
579+
mmdrop(mm);
580+
return 0;
581+
}
582+
567583
/*
568584
* Hotplug state machine related functions
569585
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
15491565
[CPUHP_BRINGUP_CPU] = {
15501566
.name = "cpu:bringup",
15511567
.startup.single = bringup_cpu,
1552-
.teardown.single = NULL,
1568+
.teardown.single = finish_cpu,
15531569
.cant_stop = true,
15541570
},
15551571
/* Final state before CPU kills itself */

kernel/exit.c

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
708708
struct task_struct *tsk = current;
709709
int group_dead;
710710

711-
profile_task_exit(tsk);
712-
kcov_task_exit(tsk);
711+
/*
712+
* We can get here from a kernel oops, sometimes with preemption off.
713+
* Start by checking for critical errors.
714+
* Then fix up important state like USER_DS and preemption.
715+
* Then do everything else.
716+
*/
713717

714718
WARN_ON(blk_needs_flush_plug(tsk));
715719

@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
727731
*/
728732
set_fs(USER_DS);
729733

734+
if (unlikely(in_atomic())) {
735+
pr_info("note: %s[%d] exited with preempt_count %d\n",
736+
current->comm, task_pid_nr(current),
737+
preempt_count());
738+
preempt_count_set(PREEMPT_ENABLED);
739+
}
740+
741+
profile_task_exit(tsk);
742+
kcov_task_exit(tsk);
743+
730744
ptrace_event(PTRACE_EVENT_EXIT, code);
731745

732746
validate_creds_for_do_exit(tsk);
@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
744758

745759
exit_signals(tsk); /* sets PF_EXITING */
746760

747-
if (unlikely(in_atomic())) {
748-
pr_info("note: %s[%d] exited with preempt_count %d\n",
749-
current->comm, task_pid_nr(current),
750-
preempt_count());
751-
preempt_count_set(PREEMPT_ENABLED);
752-
}
753-
754761
/* sync mm's RSS info before statistics gathering */
755762
if (tsk->mm)
756763
sync_mm_rss(tsk->mm);

kernel/irq_work.c

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
3131
{
3232
int oflags;
3333

34-
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
34+
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
3535
/*
3636
* If the work is already pending, no need to raise the IPI.
3737
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
102102
if (cpu != smp_processor_id()) {
103103
/* Arch remote IPI send/receive backend aren't NMI safe */
104104
WARN_ON_ONCE(in_nmi());
105-
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
106-
arch_send_call_function_single_ipi(cpu);
105+
__smp_call_single_queue(cpu, &work->llnode);
107106
} else {
108107
__irq_work_queue_local(work);
109108
}
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
131130
return true;
132131
}
133132

133+
void irq_work_single(void *arg)
134+
{
135+
struct irq_work *work = arg;
136+
int flags;
137+
138+
/*
139+
* Clear the PENDING bit, after this point the @work
140+
* can be re-used.
141+
* Make it immediately visible so that other CPUs trying
142+
* to claim that work don't rely on us to handle their data
143+
* while we are in the middle of the func.
144+
*/
145+
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
146+
147+
lockdep_irq_work_enter(work);
148+
work->func(work);
149+
lockdep_irq_work_exit(work);
150+
/*
151+
* Clear the BUSY bit and return to the free state if
152+
* no-one else claimed it meanwhile.
153+
*/
154+
flags &= ~IRQ_WORK_PENDING;
155+
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
156+
}
157+
134158
static void irq_work_run_list(struct llist_head *list)
135159
{
136160
struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
142166
return;
143167

144168
llnode = llist_del_all(list);
145-
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
146-
int flags;
147-
/*
148-
* Clear the PENDING bit, after this point the @work
149-
* can be re-used.
150-
* Make it immediately visible so that other CPUs trying
151-
* to claim that work don't rely on us to handle their data
152-
* while we are in the middle of the func.
153-
*/
154-
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
155-
156-
lockdep_irq_work_enter(work);
157-
work->func(work);
158-
lockdep_irq_work_exit(work);
159-
/*
160-
* Clear the BUSY bit and return to the free state if
161-
* no-one else claimed it meanwhile.
162-
*/
163-
flags &= ~IRQ_WORK_PENDING;
164-
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
165-
}
169+
llist_for_each_entry_safe(work, tmp, llnode, llnode)
170+
irq_work_single(work);
166171
}
167172

168173
/*

0 commit comments

Comments
 (0)