
Commit 227a4aa

compudj authored and Ingo Molnar committed
sched/membarrier: Fix p->mm->membarrier_state racy load
The membarrier_state field is located within the mm_struct, which is not guaranteed to exist when used from runqueue-lock-free iteration on runqueues by the membarrier system call.

Copy the membarrier_state from the mm_struct into the scheduler runqueue when the scheduler switches between mms.

When registering membarrier for an mm, after setting the registration bit in the mm membarrier state, issue a synchronize_rcu() to ensure the scheduler observes the change. In order to take care of the case where a runqueue keeps executing the target mm without switching to another mm, iterate over each runqueue and issue an IPI to copy the membarrier_state from the mm_struct into each runqueue that runs the mm whose state has just been modified.

Move the mm membarrier_state field closer to pgd in mm_struct so it uses a cache line already touched by the scheduler's switch_mm().

The membarrier_execve() (now membarrier_exec_mmap) hook now needs to clear the runqueue's membarrier state in addition to clearing the mm membarrier state, so move its implementation into the scheduler membarrier code so it can access the runqueue structure.

Add a memory barrier in membarrier_exec_mmap() prior to clearing the membarrier state, ensuring memory accesses executed prior to exec are not reordered with the stores clearing the membarrier state.

As suggested by Linus, move all membarrier.c RCU read-side locks outside of the for-each-cpu loops.

Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Eric W. Biederman <[email protected]>
Cc: Kirill Tkhai <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Russell King - ARM Linux admin <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent 2840cf0 commit 227a4aa
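
For context, this is how the affected system call is driven from user space: a minimal sketch (not part of this commit), assuming a kernel built with CONFIG_MEMBARRIER and the UAPI header <linux/membarrier.h>. It registers the process for global-expedited membarrier — the path whose registration now also synchronizes each runqueue's membarrier_state and, per the updated documentation in kernel/sched/membarrier.c below, may now fail with -ENOMEM — and then issues one expedited barrier. There is no libc wrapper for membarrier(2), so the raw syscall() form is used.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* MEMBARRIER_CMD_QUERY returns a bitmask of supported commands. */
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0) {
		perror("membarrier query");
		return 1;
	}
	if (!(mask & MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED)) {
		fprintf(stderr, "global expedited membarrier not supported\n");
		return 1;
	}

	/* Registration: after this commit it can also fail with -ENOMEM. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0)) {
		perror("membarrier register");
		return 1;
	}

	/* Expedited barrier across all registered processes. */
	if (membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	return 0;
}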

File tree

6 files changed: +183 -54 lines changed


fs/exec.c

Lines changed: 1 addition & 1 deletion
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
 	}
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
+	membarrier_exec_mmap(mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	membarrier_execve(current);
 	rseq_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);

include/linux/mm_types.h

Lines changed: 11 additions & 3 deletions
@@ -383,6 +383,16 @@ struct mm_struct {
 		unsigned long highest_vm_end;	/* highest vma end address */
 		pgd_t * pgd;
 
+#ifdef CONFIG_MEMBARRIER
+		/**
+		 * @membarrier_state: Flags controlling membarrier behavior.
+		 *
+		 * This field is close to @pgd to hopefully fit in the same
+		 * cache-line, which needs to be touched by switch_mm().
+		 */
+		atomic_t membarrier_state;
+#endif
+
 		/**
 		 * @mm_users: The number of users including userspace.
 		 *
@@ -452,9 +462,7 @@ struct mm_struct {
 		unsigned long flags; /* Must use atomic bitops to access */
 
 		struct core_state *core_state; /* coredumping support */
-#ifdef CONFIG_MEMBARRIER
-		atomic_t membarrier_state;
-#endif
+
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;

include/linux/sched/mm.h

Lines changed: 3 additions & 5 deletions
@@ -370,10 +370,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 	sync_core_before_usermode();
 }
 
-static inline void membarrier_execve(struct task_struct *t)
-{
-	atomic_set(&t->mm->membarrier_state, 0);
-}
+extern void membarrier_exec_mmap(struct mm_struct *mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -382,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 {
 }
 #endif
-static inline void membarrier_execve(struct task_struct *t)
+static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)

kernel/sched/core.c

Lines changed: 2 additions & 2 deletions
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		else
 			prev->active_mm = NULL;
 	} else { // to user
+		membarrier_switch_mm(rq, prev->active_mm, next->mm);
 		/*
 		 * sys_membarrier() requires an smp_mb() between setting
-		 * rq->curr and returning to userspace.
+		 * rq->curr / membarrier_switch_mm() and returning to userspace.
 		 *
 		 * The below provides this either through switch_mm(), or in
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
 		 */
-
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) { // from kernel
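
The comment updated above states the requirement: a full barrier must separate the rq->curr / membarrier_switch_mm() stores from the return to user space. As a rough user-space analogue (not from the commit; all names below are invented), the classic store-buffering litmus shows the reordering such a barrier forbids: with both seq_cst fences in place, the C11 memory model rules out the outcome r0 == 0 && r1 == 0, which is the same class of reordering the smp_mb() pairing in the scheduler rules out for membarrier.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int x, y;
static int r0, r1;

static void *writer_x(void *arg)
{
	(void)arg;
	atomic_store_explicit(&x, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* plays the smp_mb() */
	r0 = atomic_load_explicit(&y, memory_order_relaxed);
	return NULL;
}

static void *writer_y(void *arg)
{
	(void)arg;
	atomic_store_explicit(&y, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* plays the smp_mb() */
	r1 = atomic_load_explicit(&x, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, writer_x, NULL);
	pthread_create(&b, NULL, writer_y, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences, r0 == 0 && r1 == 0 cannot happen. */
	printf("r0=%d r1=%d\n", r0, r1);
	return 0;
}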

kernel/sched/membarrier.c

Lines changed: 132 additions & 43 deletions
@@ -30,6 +30,39 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	this_cpu_write(runqueues.membarrier_state,
+		       atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting
+	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+	 * guarantee that no memory access following registration is reordered
+	 * before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+	/*
+	 * Issue a memory barrier before clearing membarrier_state to
+	 * guarantee that no memory access prior to exec is reordered after
+	 * clearing this state.
+	 */
+	smp_mb();
+	atomic_set(&mm->membarrier_state, 0);
+	/*
+	 * Keep the runqueue membarrier_state in sync with this mm
+	 * membarrier_state.
+	 */
+	this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
@@ -56,6 +89,7 @@ static int membarrier_global_expedited(void)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -70,17 +104,25 @@ static int membarrier_global_expedited(void)
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
+		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+			continue;
+
+		/*
+		 * Skip the CPU if it runs a kernel thread. The scheduler
+		 * leaves the prior task mm in place as an optimization when
+		 * scheduling a kthread.
+		 */
 		p = rcu_dereference(cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		if (!fallback)
+			__cpumask_set_cpu(cpu, tmpmask);
+		else
+			smp_call_function_single(cpu, ipi_mb, NULL, 1);
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags)
 			else
 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags)
 	return 0;
 }
 
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = rcu_dereference(&rq->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
+	cpus_read_unlock();
+
+	return 0;
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int ret;
 
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
-	}
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -213,33 +302,31 @@ static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+	    ret;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
 	 * We need to consider threads belonging to different thread
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & state) == state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) != 1) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
@@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags)
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
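
To see the shape of sync_runqueues_membarrier_state() and ipi_sync_rq_state() outside the kernel, here is a hedged user-space analogue (not from the commit; every name below is invented). Each worker keeps a private cached copy of a shared state word and reads only that cache on its hot path, much like the fast paths above read rq->membarrier_state instead of mm->membarrier_state; the updater therefore has to push new state into every cache rather than wait for workers to refresh it on their own, which is the role the IPI plays for a runqueue that keeps running the same mm. One simplification: the kernel's IPI makes the target CPU perform a purely local write, while this sketch writes the remote slot directly.

#include <stdatomic.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NWORKERS 4

static atomic_int shared_state;			/* plays mm->membarrier_state */
static atomic_int cached_state[NWORKERS];	/* plays rq->membarrier_state */
static atomic_bool stop;

static void *worker(void *arg)
{
	int id = (int)(long)arg;

	while (!atomic_load_explicit(&stop, memory_order_relaxed)) {
		/* Hot path: consult only the per-worker cache. */
		int s = atomic_load_explicit(&cached_state[id],
					     memory_order_acquire);
		(void)s;
	}
	return NULL;
}

static void register_state(int new_bits)
{
	/* Publish in the shared location first... */
	atomic_fetch_or_explicit(&shared_state, new_bits,
				 memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	/*
	 * ...then push it into every per-worker cache, since a busy worker
	 * (like a runqueue that never switches mm) would never refresh it
	 * by itself.
	 */
	for (int i = 0; i < NWORKERS; i++)
		atomic_store_explicit(&cached_state[i],
				      atomic_load_explicit(&shared_state,
							   memory_order_relaxed),
				      memory_order_release);
}

int main(void)
{
	pthread_t tid[NWORKERS];

	for (long i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);

	register_state(0x1);
	sleep(1);

	atomic_store(&stop, true);
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);

	printf("shared=0x%x cached[0]=0x%x\n",
	       (unsigned int)atomic_load(&shared_state),
	       (unsigned int)atomic_load(&cached_state[0]));
	return 0;
}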

kernel/sched/sched.h

Lines changed: 34 additions & 0 deletions
@@ -911,6 +911,10 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
 	struct root_domain		*rd;
 	struct sched_domain __rcu	*sd;
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+	int membarrier_state;
+
+	if (prev_mm == next_mm)
+		return;
+
+	membarrier_state = atomic_read(&next_mm->membarrier_state);
+	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+		return;
+
+	WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+}
+#endif
