Skip to content

Commit 16d51a5

Browse files
thejh authored and Ingo Molnar committed
sched/fair: Don't free p->numa_faults with concurrent readers
When going through execve(), zero out the NUMA fault statistics instead of freeing them. During execve, the task is reachable through procfs and the scheduler. A concurrent /proc/*/sched reader can read data from a freed ->numa_faults allocation (confirmed by KASAN) and write it back to userspace. I believe that it would also be possible for a use-after-free read to occur through a race between a NUMA fault and execve(): task_numa_fault() can lead to task_numa_compare(), which invokes task_weight() on the currently running task of a different CPU. Another way to fix this would be to make ->numa_faults RCU-managed or add extra locking, but it seems easier to wipe the NUMA fault statistics on execve. Signed-off-by: Jann Horn <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Petr Mladek <[email protected]> Cc: Sergey Senozhatsky <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Will Deacon <[email protected]> Fixes: 8272701 ("sched/numa: Call task_numa_free() from do_execve()") Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Ingo Molnar <[email protected]>
1 parent 7b5cf70 commit 16d51a5

File tree

4 files changed

+24
-8
lines changed

4 files changed

+24
-8
lines changed

fs/exec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1828,7 +1828,7 @@ static int __do_execve_file(int fd, struct filename *filename,
18281828
membarrier_execve(current);
18291829
rseq_execve(current);
18301830
acct_update_integrals(current);
1831-
task_numa_free(current);
1831+
task_numa_free(current, false);
18321832
free_bprm(bprm);
18331833
kfree(pathbuf);
18341834
if (filename)

include/linux/sched/numa_balancing.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
extern void task_numa_fault(int last_node, int node, int pages, int flags);
2020
extern pid_t task_numa_group_id(struct task_struct *p);
2121
extern void set_numabalancing_state(bool enabled);
22-
extern void task_numa_free(struct task_struct *p);
22+
extern void task_numa_free(struct task_struct *p, bool final);
2323
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
2424
int src_nid, int dst_cpu);
2525
#else
@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
3434
static inline void set_numabalancing_state(bool enabled)
3535
{
3636
}
37-
static inline void task_numa_free(struct task_struct *p)
37+
static inline void task_numa_free(struct task_struct *p, bool final)
3838
{
3939
}
4040
static inline bool should_numa_migrate_memory(struct task_struct *p,

kernel/fork.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
726726
WARN_ON(tsk == current);
727727

728728
cgroup_free(tsk);
729-
task_numa_free(tsk);
729+
task_numa_free(tsk, true);
730730
security_task_free(tsk);
731731
exit_creds(tsk);
732732
delayacct_tsk_free(tsk);

kernel/sched/fair.c

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2353,13 +2353,23 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
23532353
return;
23542354
}
23552355

2356-
void task_numa_free(struct task_struct *p)
2356+
/*
2357+
* Get rid of NUMA statistics associated with a task (either current or dead).
2358+
* If @final is set, the task is dead and has reached refcount zero, so we can
2359+
* safely free all relevant data structures. Otherwise, there might be
2360+
* concurrent reads from places like load balancing and procfs, and we should
2361+
* reset the data back to default state without freeing ->numa_faults.
2362+
*/
2363+
void task_numa_free(struct task_struct *p, bool final)
23572364
{
23582365
struct numa_group *grp = p->numa_group;
2359-
void *numa_faults = p->numa_faults;
2366+
unsigned long *numa_faults = p->numa_faults;
23602367
unsigned long flags;
23612368
int i;
23622369

2370+
if (!numa_faults)
2371+
return;
2372+
23632373
if (grp) {
23642374
spin_lock_irqsave(&grp->lock, flags);
23652375
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2382,14 @@ void task_numa_free(struct task_struct *p)
23722382
put_numa_group(grp);
23732383
}
23742384

2375-
p->numa_faults = NULL;
2376-
kfree(numa_faults);
2385+
if (final) {
2386+
p->numa_faults = NULL;
2387+
kfree(numa_faults);
2388+
} else {
2389+
p->total_numa_faults = 0;
2390+
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2391+
numa_faults[i] = 0;
2392+
}
23772393
}
23782394

23792395
/*

0 commit comments

Comments
 (0)