Skip to content

Commit 3ef240e

Browse files
committed
futex: Prevent exit livelock
Oleg provided the following test case:

    int main(void)
    {
        struct sched_param sp = {};

        sp.sched_priority = 2;
        assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);

        int lock = vfork();
        if (!lock) {
            sp.sched_priority = 1;
            assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
            _exit(0);
        }

        syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0, 0, 0);
        return 0;
    }

This creates an unkillable RT process spinning in futex_lock_pi() on a UP
machine or if the process is affine to a single CPU. The reason is:

 parent                                 child

  set FIFO prio 2

  vfork()                       ->      set FIFO prio 1
   implies wait_for_child()             sched_setscheduler(...)
                                        exit()
                                        do_exit()
                                        ....
                                        mm_release()
                                          tsk->futex_state = FUTEX_STATE_EXITING;
                                          exit_futex(); (NOOP in this case)
                                          complete() --> wakes parent
  sys_futex()
    loop infinite because
    tsk->futex_state == FUTEX_STATE_EXITING

The same problem can happen just by regular preemption as well:

  task holds futex
  ...
  do_exit()
    tsk->futex_state = FUTEX_STATE_EXITING;

  --> preemption (unrelated wakeup of some other higher prio task,
      e.g. timer)

  switch_to(other_task)

  return to user
  sys_futex()
    loop infinite as above

Just for the fun of it the futex exit cleanup could trigger the wakeup
itself before the task sets its futex state to DEAD.

To cure this, the handling of the exiting owner is changed so:

 - A refcount is held on the task

 - The task pointer is stored in a caller visible location

 - The caller drops all locks (hash bucket, mmap_sem) and blocks on
   task::futex_exit_mutex. When the mutex is acquired then the exiting
   task has completed the cleanup and the state is consistent and can be
   reevaluated.

This is not a pretty solution, but there is no choice other than returning
an error code to user space, which would break the state consistency
guarantee and open another can of problems including regressions.

For stable backports the preparatory commits ac31c7f .. ba31c1a are
required as well, but for anything older than 5.3.y the backports are
going to be provided when this hits mainline as the other dependencies
for those kernels are definitely not stable material.

Fixes: 778e9a9 ("pi-futex: fix exit races and locking problems")
Reported-by: Oleg Nesterov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Stable Team <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent ac31c7f commit 3ef240e

File tree

1 file changed

+91
-15
lines changed

1 file changed

+91
-15
lines changed

kernel/futex.c

Lines changed: 91 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,36 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	return ret;
 }
 
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @exiting:	Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+	if (ret != -EBUSY) {
+		WARN_ON_ONCE(exiting);
+		return;
+	}
+
+	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+		return;
+
+	mutex_lock(&exiting->futex_exit_mutex);
+	/*
+	 * No point in doing state checking here. If the waiter got here
+	 * while the task was in exec()->exec_futex_release() then it can
+	 * have any FUTEX_STATE_* value when the waiter has acquired the
+	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
+	 * already. Highly unlikely and not a problem. Just one more round
+	 * through the futex maze.
+	 */
+	mutex_unlock(&exiting->futex_exit_mutex);
+
+	put_task_struct(exiting);
+}
+
 static int handle_exit_race(u32 __user *uaddr, u32 uval,
 			    struct task_struct *tsk)
 {
@@ -1237,7 +1267,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
12371267
* it after doing proper sanity checks.
12381268
*/
12391269
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1240-
struct futex_pi_state **ps)
1270+
struct futex_pi_state **ps,
1271+
struct task_struct **exiting)
12411272
{
12421273
pid_t pid = uval & FUTEX_TID_MASK;
12431274
struct futex_pi_state *pi_state;
@@ -1276,7 +1307,19 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 		int ret = handle_exit_race(uaddr, uval, p);
 
 		raw_spin_unlock_irq(&p->pi_lock);
-		put_task_struct(p);
+		/*
+		 * If the owner task is between FUTEX_STATE_EXITING and
+		 * FUTEX_STATE_DEAD then store the task pointer and keep
+		 * the reference on the task struct. The calling code will
+		 * drop all locks, wait for the task to reach
+		 * FUTEX_STATE_DEAD and then drop the refcount. This is
+		 * required to prevent a live lock when the current task
+		 * preempted the exiting task between the two states.
+		 */
+		if (ret == -EBUSY)
+			*exiting = p;
+		else
+			put_task_struct(p);
 		return ret;
 	}
 
@@ -1315,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
13151358

13161359
static int lookup_pi_state(u32 __user *uaddr, u32 uval,
13171360
struct futex_hash_bucket *hb,
1318-
union futex_key *key, struct futex_pi_state **ps)
1361+
union futex_key *key, struct futex_pi_state **ps,
1362+
struct task_struct **exiting)
13191363
{
13201364
struct futex_q *top_waiter = futex_top_waiter(hb, key);
13211365

@@ -1330,7 +1374,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
13301374
* We are the first waiter - try to look up the owner based on
13311375
* @uval and attach to it.
13321376
*/
1333-
return attach_to_pi_owner(uaddr, uval, key, ps);
1377+
return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
13341378
}
13351379

13361380
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1358,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
13581402
* lookup
13591403
* @task: the task to perform the atomic lock work for. This will
13601404
* be "current" except in the case of requeue pi.
1405+
* @exiting: Pointer to store the task pointer of the owner task
1406+
* which is in the middle of exiting
13611407
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
13621408
*
13631409
* Return:
@@ -1366,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
13661412
* - <0 - error
13671413
*
13681414
* The hb->lock and futex_key refs shall be held by the caller.
1415+
*
1416+
* @exiting is only set when the return value is -EBUSY. If so, this holds
1417+
* a refcount on the exiting task on return and the caller needs to drop it
1418+
* after waiting for the exit to complete.
13691419
*/
13701420
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
13711421
union futex_key *key,
13721422
struct futex_pi_state **ps,
1373-
struct task_struct *task, int set_waiters)
1423+
struct task_struct *task,
1424+
struct task_struct **exiting,
1425+
int set_waiters)
13741426
{
13751427
u32 uval, newval, vpid = task_pid_vnr(task);
13761428
struct futex_q *top_waiter;
@@ -1440,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
14401492
* attach to the owner. If that fails, no harm done, we only
14411493
* set the FUTEX_WAITERS bit in the user space variable.
14421494
*/
1443-
return attach_to_pi_owner(uaddr, newval, key, ps);
1495+
return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
14441496
}
14451497

14461498
/**
@@ -1858,23 +1910,29 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
18581910
* @key1: the from futex key
18591911
* @key2: the to futex key
18601912
* @ps: address to store the pi_state pointer
1913+
* @exiting: Pointer to store the task pointer of the owner task
1914+
* which is in the middle of exiting
18611915
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
18621916
*
18631917
* Try and get the lock on behalf of the top waiter if we can do it atomically.
18641918
* Wake the top waiter if we succeed. If the caller specified set_waiters,
18651919
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
18661920
* hb1 and hb2 must be held by the caller.
18671921
*
1922+
* @exiting is only set when the return value is -EBUSY. If so, this holds
1923+
* a refcount on the exiting task on return and the caller needs to drop it
1924+
* after waiting for the exit to complete.
1925+
*
18681926
* Return:
18691927
* - 0 - failed to acquire the lock atomically;
18701928
* - >0 - acquired the lock, return value is vpid of the top_waiter
18711929
* - <0 - error
18721930
*/
1873-
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1874-
struct futex_hash_bucket *hb1,
1875-
struct futex_hash_bucket *hb2,
1876-
union futex_key *key1, union futex_key *key2,
1877-
struct futex_pi_state **ps, int set_waiters)
1931+
static int
1932+
futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
1933+
struct futex_hash_bucket *hb2, union futex_key *key1,
1934+
union futex_key *key2, struct futex_pi_state **ps,
1935+
struct task_struct **exiting, int set_waiters)
18781936
{
18791937
struct futex_q *top_waiter = NULL;
18801938
u32 curval;
@@ -1911,7 +1969,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
19111969
*/
19121970
vpid = task_pid_vnr(top_waiter->task);
19131971
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1914-
set_waiters);
1972+
exiting, set_waiters);
19151973
if (ret == 1) {
19161974
requeue_pi_wake_futex(top_waiter, key2, hb2);
19171975
return vpid;
@@ -2040,14 +2098,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
20402098
}
20412099

20422100
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
2101+
struct task_struct *exiting = NULL;
2102+
20432103
/*
20442104
* Attempt to acquire uaddr2 and wake the top waiter. If we
20452105
* intend to requeue waiters, force setting the FUTEX_WAITERS
20462106
* bit. We force this here where we are able to easily handle
20472107
* faults rather in the requeue loop below.
20482108
*/
20492109
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2050-
&key2, &pi_state, nr_requeue);
2110+
&key2, &pi_state,
2111+
&exiting, nr_requeue);
20512112

20522113
/*
20532114
* At this point the top_waiter has either taken uaddr2 or is
@@ -2074,7 +2135,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
20742135
* If that call succeeds then we have pi_state and an
20752136
* initial refcount on it.
20762137
*/
2077-
ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
2138+
ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
2139+
&pi_state, &exiting);
20782140
}
20792141

20802142
switch (ret) {
@@ -2104,6 +2166,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
21042166
hb_waiters_dec(hb2);
21052167
put_futex_key(&key2);
21062168
put_futex_key(&key1);
2169+
/*
2170+
* Handle the case where the owner is in the middle of
2171+
* exiting. Wait for the exit to complete otherwise
2172+
* this task might loop forever, aka. live lock.
2173+
*/
2174+
wait_for_owner_exiting(ret, exiting);
21072175
cond_resched();
21082176
goto retry;
21092177
default:
@@ -2810,6 +2878,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
28102878
{
28112879
struct hrtimer_sleeper timeout, *to;
28122880
struct futex_pi_state *pi_state = NULL;
2881+
struct task_struct *exiting = NULL;
28132882
struct rt_mutex_waiter rt_waiter;
28142883
struct futex_hash_bucket *hb;
28152884
struct futex_q q = futex_q_init;
@@ -2831,7 +2900,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
28312900
retry_private:
28322901
hb = queue_lock(&q);
28332902

2834-
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2903+
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
2904+
&exiting, 0);
28352905
if (unlikely(ret)) {
28362906
/*
28372907
* Atomic work succeeded and we got the lock,
@@ -2854,6 +2924,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
28542924
*/
28552925
queue_unlock(hb);
28562926
put_futex_key(&q.key);
2927+
/*
2928+
* Handle the case where the owner is in the middle of
2929+
* exiting. Wait for the exit to complete otherwise
2930+
* this task might loop forever, aka. live lock.
2931+
*/
2932+
wait_for_owner_exiting(ret, exiting);
28572933
cond_resched();
28582934
goto retry;
28592935
default:

0 commit comments

Comments
 (0)