
Commit ef78e5b

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Misc fixes all over the place:

   - Fix NUMA over-balancing between lightly loaded nodes. This is fallout of the big load-balancer rewrite.

   - Fix the NOHZ remote loadavg update logic, which fixes anomalies like reported 150 loadavg on mostly idle CPUs.

   - Fix XFS performance/scalability

   - Fix throttled groups unbound task-execution bug

   - Fix PSI procfs boundary condition

   - Fix the cpu.uclamp.{min,max} cgroup configuration write checks

   - Fix DocBook annotations

   - Fix RCU annotations

   - Fix overly CPU-intensive housekeeper CPU logic loop on large CPU counts"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
  sched/core: Annotate curr pointer in rq with __rcu
  sched/psi: Fix OOB write when writing 0 bytes to PSI files
  sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression
  sched/fair: Prevent unlimited runtime on throttled group
  sched/nohz: Optimize get_nohz_timer_target()
  sched/uclamp: Reject negative values in cpu_uclamp_write()
  sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains
  timers/nohz: Update NOHZ load in remote tick
  sched/core: Don't skip remote tick for idle CPUs
2 parents da99f93 + e9f5490 commit ef78e5b

File tree

6 files changed (+119, -53 lines)


include/linux/sched/nohz.h

Lines changed: 2 additions & 0 deletions
@@ -15,9 +15,11 @@ static inline void nohz_balance_enter_idle(int cpu) { }
 
 #ifdef CONFIG_NO_HZ_COMMON
 void calc_load_nohz_start(void);
+void calc_load_nohz_remote(struct rq *rq);
 void calc_load_nohz_stop(void);
 #else
 static inline void calc_load_nohz_start(void) { }
+static inline void calc_load_nohz_remote(struct rq *rq) { }
 static inline void calc_load_nohz_stop(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
kernel/sched/core.c

Lines changed: 34 additions & 29 deletions
@@ -552,27 +552,32 @@ void resched_cpu(int cpu)
  */
 int get_nohz_timer_target(void)
 {
-        int i, cpu = smp_processor_id();
+        int i, cpu = smp_processor_id(), default_cpu = -1;
         struct sched_domain *sd;
 
-        if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
-                return cpu;
+        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+                if (!idle_cpu(cpu))
+                        return cpu;
+                default_cpu = cpu;
+        }
 
         rcu_read_lock();
         for_each_domain(cpu, sd) {
-                for_each_cpu(i, sched_domain_span(sd)) {
+                for_each_cpu_and(i, sched_domain_span(sd),
+                        housekeeping_cpumask(HK_FLAG_TIMER)) {
                         if (cpu == i)
                                 continue;
 
-                        if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+                        if (!idle_cpu(i)) {
                                 cpu = i;
                                 goto unlock;
                         }
                 }
         }
 
-        if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
-                cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+        if (default_cpu == -1)
+                default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+        cpu = default_cpu;
 unlock:
         rcu_read_unlock();
         return cpu;
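
The rewrite above narrows the scan to housekeeping CPUs up front instead of visiting every CPU in each domain and filtering afterwards, which is what made the old loop expensive on machines with very large CPU counts. Below is a toy userspace sketch of the same selection policy, with plain arrays standing in for cpumasks and sched domains; pick_timer_target, cpu_idle and cpu_housekeeping are made-up names for illustration, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Toy stand-ins for per-CPU idle state and the HK_FLAG_TIMER housekeeping mask. */
static bool cpu_idle[NR_CPUS]         = { true, false, true, true, false, true, true, true };
static bool cpu_housekeeping[NR_CPUS] = { true, true,  false, false, true, false, false, false };

/* Pick a busy housekeeping CPU near @cpu, falling back to any housekeeping CPU. */
static int pick_timer_target(int cpu)
{
        int default_cpu = -1;
        int i;

        if (cpu_housekeeping[cpu]) {
                if (!cpu_idle[cpu])
                        return cpu;
                default_cpu = cpu;
        }

        /* Walk only housekeeping CPUs, mirroring the for_each_cpu_and() change. */
        for (i = 0; i < NR_CPUS; i++) {
                if (i == cpu || !cpu_housekeeping[i])
                        continue;
                if (!cpu_idle[i])
                        return i;
        }

        if (default_cpu != -1)
                return default_cpu;

        /* Last resort: any housekeeping CPU, as housekeeping_any_cpu() does. */
        for (i = 0; i < NR_CPUS; i++)
                if (cpu_housekeeping[i])
                        return i;
        return cpu;
}

int main(void)
{
        printf("timer target for CPU 2: %d\n", pick_timer_target(2));
        return 0;
}
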
@@ -1442,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
-static inline bool is_per_cpu_kthread(struct task_struct *p)
-{
-        if (!(p->flags & PF_KTHREAD))
-                return false;
-
-        if (p->nr_cpus_allowed != 1)
-                return false;
-
-        return true;
-}
-
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -3669,28 +3663,32 @@ static void sched_tick_remote(struct work_struct *work)
          * statistics and checks timeslices in a time-independent way, regardless
          * of when exactly it is running.
          */
-        if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+        if (!tick_nohz_tick_stopped_cpu(cpu))
                 goto out_requeue;
 
         rq_lock_irq(rq, &rf);
         curr = rq->curr;
-        if (is_idle_task(curr) || cpu_is_offline(cpu))
+        if (cpu_is_offline(cpu))
                 goto out_unlock;
 
+        curr = rq->curr;
         update_rq_clock(rq);
-        delta = rq_clock_task(rq) - curr->se.exec_start;
 
-        /*
-         * Make sure the next tick runs within a reasonable
-         * amount of time.
-         */
-        WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        if (!is_idle_task(curr)) {
+                /*
+                 * Make sure the next tick runs within a reasonable
+                 * amount of time.
+                 */
+                delta = rq_clock_task(rq) - curr->se.exec_start;
+                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        }
         curr->sched_class->task_tick(rq, curr, 0);
 
+        calc_load_nohz_remote(rq);
 out_unlock:
         rq_unlock_irq(rq, &rf);
-
 out_requeue:
+
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
@@ -7063,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk)
 
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
-        if (running)
+        if (running) {
                 set_next_task(rq, tsk);
+                /*
+                 * After changing group, the running task may have joined a
+                 * throttled one but it's still the running task. Trigger a
+                 * resched to make sure that task can still run.
+                 */
+                resched_curr(rq);
+        }
 
         task_rq_unlock(rq, tsk, &rf);
 }
@@ -7260,7 +7265,7 @@ capacity_from_percent(char *buf)
                                              &req.percent);
                 if (req.ret)
                         return req;
-                if (req.percent > UCLAMP_PERCENT_SCALE) {
+                if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
                         req.ret = -ERANGE;
                         return req;
                 }
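
The cast is the whole fix: the percentage is parsed into a signed 64-bit field, so a negative value such as -1 passes a signed comparison against UCLAMP_PERCENT_SCALE, while the same bits compared as u64 are enormous and get rejected with -ERANGE. A self-contained demo of that comparison behaviour follows; the typedefs and the scale constant are recreated here purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Recreated locally for illustration; in the kernel these are typedefs and
 * a macro defined by the scheduler's uclamp code. */
typedef int64_t  s64;
typedef uint64_t u64;
#define UCLAMP_PERCENT_SCALE 10000      /* 100.00% with two fractional digits */

int main(void)
{
        s64 percent = -1;       /* e.g. a parsed "-1" written to cpu.uclamp.min */

        /* Old check: signed compare, -1 < 10000, so the value slips through. */
        printf("signed   check rejects -1? %s\n",
               (percent > UCLAMP_PERCENT_SCALE) ? "yes" : "no");

        /* New check: the u64 cast turns -1 into 0xffffffffffffffff. */
        printf("unsigned check rejects -1? %s\n",
               ((u64)percent > UCLAMP_PERCENT_SCALE) ? "yes" : "no");

        return 0;
}
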

kernel/sched/fair.c

Lines changed: 43 additions & 13 deletions
@@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
  * @se: sched_entity to attach
- * @flags: migration hints
  *
  * Must call update_cfs_rq_load_avg() before this, since we rely on
  * cfs_rq->avg.last_update_time being current.
@@ -5912,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
             (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                 return prev;
 
+        /*
+         * Allow a per-cpu kthread to stack with the wakee if the
+         * kworker thread and the tasks previous CPUs are the same.
+         * The assumption is that the wakee queued work for the
+         * per-cpu kthread that is now complete and the wakeup is
+         * essentially a sync wakeup. An obvious example of this
+         * pattern is IO completions.
+         */
+        if (is_per_cpu_kthread(current) &&
+            prev == smp_processor_id() &&
+            this_rq()->nr_running <= 1) {
+                return prev;
+        }
+
         /* Check a recently used CPU as a potential idle candidate: */
         recent_used_cpu = p->recent_used_cpu;
         if (recent_used_cpu != prev &&
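
The condition above is deliberately narrow: the waker must be a per-CPU kthread (typically a kworker completing IO the wakee queued), it must be running on the wakee's previous CPU, and that runqueue must hold nothing but the waker itself. A hedged userspace sketch of just that decision, with a made-up wakeup_context struct standing in for the kernel state:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical snapshot of the state select_idle_sibling() consults. */
struct wakeup_context {
        bool waker_is_per_cpu_kthread;  /* is_per_cpu_kthread(current) */
        int  waker_cpu;                 /* smp_processor_id() */
        int  prev_cpu;                  /* wakee's previous CPU */
        int  waker_rq_nr_running;       /* this_rq()->nr_running */
};

/* Allow the wakee to stack on its previous CPU behind the per-CPU kthread. */
static bool allow_stacking(const struct wakeup_context *ctx)
{
        return ctx->waker_is_per_cpu_kthread &&
               ctx->prev_cpu == ctx->waker_cpu &&
               ctx->waker_rq_nr_running <= 1;
}

int main(void)
{
        struct wakeup_context io_completion = {
                .waker_is_per_cpu_kthread = true,
                .waker_cpu = 3,
                .prev_cpu = 3,
                .waker_rq_nr_running = 1,       /* only the kworker itself */
        };

        printf("stack on prev CPU? %s\n",
               allow_stacking(&io_completion) ? "yes" : "no");
        return 0;
}
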
@@ -8658,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         /*
          * Try to use spare capacity of local group without overloading it or
          * emptying busiest.
-         * XXX Spreading tasks across NUMA nodes is not always the best policy
-         * and special care should be taken for SD_NUMA domain level before
-         * spreading the tasks. For now, load_balance() fully relies on
-         * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
          */
         if (local->group_type == group_has_spare) {
                 if (busiest->group_type > group_fully_busy) {
@@ -8701,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                         env->migration_type = migrate_task;
                         lsub_positive(&nr_diff, local->sum_nr_running);
                         env->imbalance = nr_diff >> 1;
-                        return;
-                }
+                } else {
 
-                /*
-                 * If there is no overload, we just want to even the number of
-                 * idle cpus.
-                 */
-                env->migration_type = migrate_task;
-                env->imbalance = max_t(long, 0, (local->idle_cpus -
+                        /*
+                         * If there is no overload, we just want to even the number of
+                         * idle cpus.
+                         */
+                        env->migration_type = migrate_task;
+                        env->imbalance = max_t(long, 0, (local->idle_cpus -
                                                  busiest->idle_cpus) >> 1);
+                }
+
+                /* Consider allowing a small imbalance between NUMA groups */
+                if (env->sd->flags & SD_NUMA) {
+                        unsigned int imbalance_min;
+
+                        /*
+                         * Compute an allowed imbalance based on a simple
+                         * pair of communicating tasks that should remain
+                         * local and ignore them.
+                         *
+                         * NOTE: Generally this would have been based on
+                         * the domain size and this was evaluated. However,
+                         * the benefit is similar across a range of workloads
+                         * and machines but scaling by the domain size adds
+                         * the risk that lower domains have to be rebalanced.
+                         */
+                        imbalance_min = 2;
+                        if (busiest->sum_nr_running <= imbalance_min)
+                                env->imbalance = 0;
+                }
+
                 return;
         }
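
The effect of the new SD_NUMA branch is easy to state: after the usual imbalance calculation, if the busiest NUMA group is running at most two tasks, the imbalance is zeroed so a pair of communicating tasks is not pulled apart across nodes. A small standalone model of that final adjustment follows; group_stats and numa_imbalance are illustrative names, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative, trimmed-down view of the balancer statistics used above. */
struct group_stats {
        unsigned int idle_cpus;
        unsigned int sum_nr_running;
};

/* "Even out idle CPUs, then forgive a small imbalance between NUMA groups." */
static long numa_imbalance(const struct group_stats *local,
                           const struct group_stats *busiest,
                           bool is_numa_domain)
{
        long diff = (long)local->idle_cpus - (long)busiest->idle_cpus;
        long imbalance = diff > 0 ? diff / 2 : 0;

        /* Let a pair of communicating tasks stay on one node. */
        if (is_numa_domain && busiest->sum_nr_running <= 2)
                imbalance = 0;

        return imbalance;
}

int main(void)
{
        struct group_stats local   = { .idle_cpus = 8, .sum_nr_running = 0 };
        struct group_stats busiest = { .idle_cpus = 4, .sum_nr_running = 2 };

        printf("imbalance inside a node: %ld\n", numa_imbalance(&local, &busiest, false));
        printf("imbalance across nodes:  %ld\n", numa_imbalance(&local, &busiest, true));
        return 0;
}
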
kernel/sched/loadavg.c

Lines changed: 23 additions & 10 deletions
@@ -231,23 +231,36 @@ static inline int calc_load_read_idx(void)
         return calc_load_idx & 1;
 }
 
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
 {
-        struct rq *this_rq = this_rq();
         long delta;
 
-        /*
-         * We're going into NO_HZ mode, if there's any pending delta, fold it
-         * into the pending NO_HZ delta.
-         */
-        delta = calc_load_fold_active(this_rq, 0);
+        delta = calc_load_fold_active(rq, 0);
         if (delta) {
                 int idx = calc_load_write_idx();
 
                 atomic_long_add(delta, &calc_load_nohz[idx]);
         }
 }
 
+void calc_load_nohz_start(void)
+{
+        /*
+         * We're going into NO_HZ mode, if there's any pending delta, fold it
+         * into the pending NO_HZ delta.
+         */
+        calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+        calc_load_nohz_fold(rq);
+}
+
 void calc_load_nohz_stop(void)
 {
         struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
                 this_rq->calc_load_update += LOAD_FREQ;
 }
 
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
 {
         int idx = calc_load_read_idx();
         long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
 static inline void calc_global_nohz(void) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
         /*
          * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
          */
-        delta = calc_load_nohz_fold();
+        delta = calc_load_nohz_read();
         if (delta)
                 atomic_long_add(delta, &calc_load_tasks);
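
Taken together, the loadavg changes split the old entry point into a shared fold helper so the housekeeping CPU's remote tick can fold a tickless CPU's pending delta on its behalf, and rename the consumer side to calc_load_nohz_read() so the name no longer collides. Below is a toy userspace model of that fold/read split, using a single accumulator instead of the kernel's double-buffered calc_load_nohz[]; every name here is illustrative, not the kernel API.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative per-runqueue state: tasks that count towards loadavg. */
struct toy_rq {
        long nr_active;         /* currently contributing tasks */
        long calc_load_active;  /* contribution already folded in */
};

/* Shared accumulator for CPUs that went tickless. */
static atomic_long tickless_delta;

/* Fold one runqueue's pending delta; works for this CPU or a remote one. */
static void toy_load_nohz_fold(struct toy_rq *rq)
{
        long delta = rq->nr_active - rq->calc_load_active;

        rq->calc_load_active = rq->nr_active;
        if (delta)
                atomic_fetch_add(&tickless_delta, delta);
}

/* Reader side, drained once per sampling window (cf. calc_load_nohz_read()). */
static long toy_load_nohz_read(void)
{
        return atomic_exchange(&tickless_delta, 0);
}

int main(void)
{
        struct toy_rq local  = { .nr_active = 3, .calc_load_active = 1 };
        struct toy_rq remote = { .nr_active = 1, .calc_load_active = 2 };

        toy_load_nohz_fold(&local);     /* CPU entering NO_HZ folds itself */
        toy_load_nohz_fold(&remote);    /* remote tick folds a tickless CPU */

        printf("pending NO_HZ delta drained by the sampler: %ld\n",
               toy_load_nohz_read());
        return 0;
}
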
kernel/sched/psi.c

Lines changed: 3 additions & 0 deletions
@@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
         if (static_branch_likely(&psi_disabled))
                 return -EOPNOTSUPP;
 
+        if (!nbytes)
+                return -EINVAL;
+
         buf_size = min(nbytes, sizeof(buf));
         if (copy_from_user(buf, user_buf, buf_size))
                 return -EFAULT;
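
The new check matters because, a few lines further down (not shown in this hunk), the handler NUL-terminates the copied buffer at buf[buf_size - 1]; with nbytes == 0 that index underflows and the write lands one byte before the on-stack buffer. A minimal userspace reproduction of the pattern and the fix, where toy_write is a hypothetical handler rather than the kernel function:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define BUF_SIZE 32
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical write handler following the same copy-and-terminate pattern. */
static long toy_write(const char *user_buf, size_t nbytes)
{
        char buf[BUF_SIZE];
        size_t buf_size;

        if (!nbytes)                    /* the fix: reject empty writes up front */
                return -1;

        buf_size = MIN(nbytes, sizeof(buf));
        memcpy(buf, user_buf, buf_size);
        buf[buf_size - 1] = '\0';       /* with nbytes == 0 this would hit buf[-1] */

        printf("parsed trigger: \"%s\"\n", buf);
        return (long)buf_size;
}

int main(void)
{
        const char *trigger = "some 150000 1000000";

        toy_write(trigger, strlen(trigger) + 1);
        printf("empty write returns: %ld\n", toy_write(trigger, 0));
        return 0;
}
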

kernel/sched/sched.h

Lines changed: 14 additions & 1 deletion
@@ -896,7 +896,7 @@ struct rq {
          */
         unsigned long           nr_uninterruptible;
 
-        struct task_struct      *curr;
+        struct task_struct __rcu        *curr;
         struct task_struct      *idle;
         struct task_struct      *stop;
         unsigned long           next_balance;
@@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
 {
 }
 #endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+        if (!(p->flags & PF_KTHREAD))
+                return false;
+
+        if (p->nr_cpus_allowed != 1)
+                return false;
+
+        return true;
+}
+#endif
