
Commit 9819f68

Merge tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:
 "A set of scheduler updates:

   - Prevent PSI state corruption when schedule() races with cgroup move.

     A recent commit combined two PSI callbacks to reduce the number of
     cgroup tree updates, but missed that schedule() can drop rq::lock
     for load balancing, which opens the race window for
     cgroup_move_task() which then observes half-updated state.

     The fix is to solely use task::psi_flags instead of looking at the
     potentially mismatching scheduler state.

   - Prevent an out-of-bounds access in uclamp caused by a rounding
     division which can lead to an off-by-one error exceeding the
     buckets array size.

   - Prevent unfairness caused by missing load decay when a task is
     attached to a cfs runqueue. The old load of the task was attached
     to the runqueue and never removed. Fix it by enforcing the load
     update through the hierarchy for unthrottled runqueue instances.

   - A documentation fix for the 'sched_verbose' command line option"

* tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix unfairness caused by missing load decay
  sched: Fix out-of-bound access in uclamp
  psi: Fix psi state corruption when schedule() races with cgroup move
  sched,doc: sched_debug_verbose cmdline should be sched_verbose
2 parents 732a27a + 0258bdf commit 9819f68

File tree

4 files changed: +37 -15 lines changed


Documentation/scheduler/sched-domains.rst

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ for a given topology level by creating a sched_domain_topology_level array and
 calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you
 forgot to tweak your cmdline, you can also flip the
 /sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
 the sched domains which should catch most possible errors (described above). It
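The knob mentioned above is normally flipped by echoing into it from a shell; purely as an illustration, here is a minimal user-space sketch that does the same thing programmatically, assuming CONFIG_SCHED_DEBUG is set and debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

/* Enable verbose sched-domains debug output at runtime by writing "1"
 * to the debugfs knob named in the documentation above. */
int main(void)
{
        FILE *f = fopen("/sys/kernel/debug/sched/verbose", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}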

kernel/sched/core.c

Lines changed: 1 addition & 1 deletion
@@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
 
 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
 {
-        return clamp_value / UCLAMP_BUCKET_DELTA;
+        return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
 }
 
 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
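To illustrate the off-by-one that the min_t() cap guards against: with 20 buckets, the rounding division makes the bucket delta DIV_ROUND_CLOSEST(1024, 20) = 51, and a clamp value of 1024 then maps to bucket 1024 / 51 = 20, one past the last valid index 19. A small user-space sketch of the arithmetic, with simplified stand-ins for the kernel macros (not the actual kernel definitions):

#include <stdio.h>

/* Simplified stand-ins for the kernel definitions, for illustration only. */
#define SCHED_CAPACITY_SCALE    1024
#define UCLAMP_BUCKETS          20      /* e.g. a 20-bucket configuration */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))
#define UCLAMP_BUCKET_DELTA     DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
#define min_t(t, a, b)          ((t)(a) < (t)(b) ? (t)(a) : (t)(b))

static unsigned int bucket_id_old(unsigned int clamp_value)
{
        return clamp_value / UCLAMP_BUCKET_DELTA;               /* 1024 -> 20, out of bounds */
}

static unsigned int bucket_id_new(unsigned int clamp_value)
{
        return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA,
                     UCLAMP_BUCKETS - 1);                       /* capped at 19 */
}

int main(void)
{
        printf("delta=%u old=%u new=%u\n", (unsigned int)UCLAMP_BUCKET_DELTA,
               bucket_id_old(SCHED_CAPACITY_SCALE),
               bucket_id_new(SCHED_CAPACITY_SCALE));
        /* prints: delta=51 old=20 new=19 */
        return 0;
}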

kernel/sched/fair.c

Lines changed: 9 additions & 3 deletions
@@ -10878,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
         struct cfs_rq *cfs_rq;
 
+        list_add_leaf_cfs_rq(cfs_rq_of(se));
+
         /* Start to propagate at parent */
         se = se->parent;
 
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
 
-                if (cfs_rq_throttled(cfs_rq))
-                        break;
+                if (!cfs_rq_throttled(cfs_rq)) {
+                        update_load_avg(cfs_rq, se, UPDATE_TG);
+                        list_add_leaf_cfs_rq(cfs_rq);
+                        continue;
+                }
 
-                update_load_avg(cfs_rq, se, UPDATE_TG);
+                if (list_add_leaf_cfs_rq(cfs_rq))
+                        break;
         }
 }
 #else
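For context on the walk above, here is a simplified sketch of what for_each_sched_entity() and cfs_rq_of() do under CONFIG_FAIR_GROUP_SCHED, so that each loop iteration visits the cfs runqueue of one ancestor task group (illustrative, not the actual kernel definitions):

/* Illustrative, simplified versions of the helpers used in the hunk above. */
struct cfs_rq;

struct sched_entity {
        struct sched_entity     *parent;        /* NULL at the top level */
        struct cfs_rq           *cfs_rq;        /* runqueue this entity is queued on */
};

/* Walk from an entity up through every ancestor task group. */
#define for_each_sched_entity(se) \
        for (; se; se = se->parent)

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
        return se->cfs_rq;
}

With that walk in mind, the fix ensures every unthrottled level on the way up gets update_load_avg(), rather than breaking out at the first throttled level and leaving the remaining ancestors holding attached load that is never decayed or removed.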

kernel/sched/psi.c

Lines changed: 26 additions & 10 deletions
@@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-        unsigned int task_flags = 0;
+        unsigned int task_flags;
         struct rq_flags rf;
         struct rq *rq;
 
@@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
         rq = task_rq_lock(task, &rf);
 
-        if (task_on_rq_queued(task)) {
-                task_flags = TSK_RUNNING;
-                if (task_current(rq, task))
-                        task_flags |= TSK_ONCPU;
-        } else if (task->in_iowait)
-                task_flags = TSK_IOWAIT;
-
-        if (task->in_memstall)
-                task_flags |= TSK_MEMSTALL;
+        /*
+         * We may race with schedule() dropping the rq lock between
+         * deactivating prev and switching to next. Because the psi
+         * updates from the deactivation are deferred to the switch
+         * callback to save cgroup tree updates, the task's scheduling
+         * state here is not coherent with its psi state:
+         *
+         * schedule()                   cgroup_move_task()
+         *   rq_lock()
+         *   deactivate_task()
+         *     p->on_rq = 0
+         *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+         *   pick_next_task()
+         *     rq_unlock()
+         *                              rq_lock()
+         *                              psi_task_change() // old cgroup
+         *                              task->cgroups = to
+         *                              psi_task_change() // new cgroup
+         *                              rq_unlock()
+         *     rq_lock()
+         *     psi_sched_switch() // does deferred updates in new cgroup
+         *
+         * Don't rely on the scheduling state. Use psi_flags instead.
+         */
+        task_flags = task->psi_flags;
 
         if (task_flags)
                 psi_task_change(task, task_flags, 0);
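For context (not part of this diff), the remainder of cgroup_move_task() then performs the actual move while still holding the rq lock; a simplified sketch, assuming it keeps the usual clear-in-old-cgroup / set-in-new-cgroup pattern:

        /* Simplified sketch, not verbatim kernel code: the aggregated PSI
         * state read from task->psi_flags is cleared in the old cgroup,
         * the task is switched over, and the same state is set again in
         * the new cgroup, all under the rq lock. */
        if (task_flags)
                psi_task_change(task, task_flags, 0);   /* clear in old cgroup */

        rcu_assign_pointer(task->cgroups, to);          /* the actual move */

        if (task_flags)
                psi_task_change(task, 0, task_flags);   /* set in new cgroup */

        task_rq_unlock(rq, task, &rf);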
