Skip to content

Commit 3d4775d

Browse files
committed
futex: Replace PF_EXITPIDONE with a state
The futex exit handling relies on PF_ flags. That's suboptimal as it requires a smp_mb() and an ugly lock/unlock of the exiting tasks pi_lock in the middle of do_exit() to enforce the observability of PF_EXITING in the futex code. Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic over to the new state. The PF_EXITING dependency will be cleaned up in a later step. This prepares for handling various futex exit issues later. Signed-off-by: Thomas Gleixner <[email protected]> Reviewed-by: Ingo Molnar <[email protected]> Acked-by: Peter Zijlstra (Intel) <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent ba31c1a commit 3d4775d

File tree

4 files changed

+49
-29
lines changed

4 files changed

+49
-29
lines changed

include/linux/futex.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ union futex_key {
5050
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
5151

5252
#ifdef CONFIG_FUTEX
53+
enum {
54+
FUTEX_STATE_OK,
55+
FUTEX_STATE_DEAD,
56+
};
5357

5458
static inline void futex_init_task(struct task_struct *tsk)
5559
{
@@ -59,6 +63,34 @@ static inline void futex_init_task(struct task_struct *tsk)
5963
#endif
6064
INIT_LIST_HEAD(&tsk->pi_state_list);
6165
tsk->pi_state_cache = NULL;
66+
tsk->futex_state = FUTEX_STATE_OK;
67+
}
68+
69+
/**
70+
* futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
71+
* @tsk: task to set the state on
72+
*
73+
* Set the futex exit state of the task lockless. The futex waiter code
74+
* observes that state when a task is exiting and loops until the task has
75+
* actually finished the futex cleanup. The worst case for this is that the
76+
* waiter runs through the wait loop until the state becomes visible.
77+
*
78+
* This has two callers:
79+
*
80+
* - futex_mm_release() after the futex exit cleanup has been done
81+
*
82+
* - do_exit() from the recursive fault handling path.
83+
*
84+
* In case of a recursive fault this is best effort. Either the futex exit
85+
* code has run already or not. If the OWNER_DIED bit has been set on the
86+
* futex then the waiter can take it over. If not, the problem is pushed
87+
* back to user space. If the futex exit code did not run yet, then an
88+
* already queued waiter might block forever, but there is nothing which
89+
* can be done about that.
90+
*/
91+
static inline void futex_exit_done(struct task_struct *tsk)
92+
{
93+
tsk->futex_state = FUTEX_STATE_DEAD;
6294
}
6395

6496
void futex_mm_release(struct task_struct *tsk);
@@ -68,6 +100,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
68100
#else
69101
static inline void futex_init_task(struct task_struct *tsk) { }
70102
static inline void futex_mm_release(struct task_struct *tsk) { }
103+
static inline void futex_exit_done(struct task_struct *tsk) { }
71104
static inline long do_futex(u32 __user *uaddr, int op, u32 val,
72105
ktime_t *timeout, u32 __user *uaddr2,
73106
u32 val2, u32 val3)

include/linux/sched.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,7 @@ struct task_struct {
10531053
#endif
10541054
struct list_head pi_state_list;
10551055
struct futex_pi_state *pi_state_cache;
1056+
unsigned int futex_state;
10561057
#endif
10571058
#ifdef CONFIG_PERF_EVENTS
10581059
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
@@ -1441,7 +1442,6 @@ extern struct pid *cad_pid;
14411442
*/
14421443
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
14431444
#define PF_EXITING 0x00000004 /* Getting shut down */
1444-
#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
14451445
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
14461446
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
14471447
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */

kernel/exit.c

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -746,16 +746,7 @@ void __noreturn do_exit(long code)
746746
*/
747747
if (unlikely(tsk->flags & PF_EXITING)) {
748748
pr_alert("Fixing recursive fault but reboot is needed!\n");
749-
/*
750-
* We can do this unlocked here. The futex code uses
751-
* this flag just to verify whether the pi state
752-
* cleanup has been done or not. In the worst case it
753-
* loops once more. We pretend that the cleanup was
754-
* done as there is no way to return. Either the
755-
* OWNER_DIED bit is set by now or we push the blocked
756-
* task into the wait for ever nirwana as well.
757-
*/
758-
tsk->flags |= PF_EXITPIDONE;
749+
futex_exit_done(tsk);
759750
set_current_state(TASK_UNINTERRUPTIBLE);
760751
schedule();
761752
}
@@ -846,12 +837,7 @@ void __noreturn do_exit(long code)
846837
* Make sure we are holding no locks:
847838
*/
848839
debug_check_no_locks_held();
849-
/*
850-
* We can do this unlocked here. The futex code uses this flag
851-
* just to verify whether the pi state cleanup has been done
852-
* or not. In the worst case it loops once more.
853-
*/
854-
tsk->flags |= PF_EXITPIDONE;
840+
futex_exit_done(tsk);
855841

856842
if (tsk->io_context)
857843
exit_io_context(tsk);

kernel/futex.c

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
11821182
u32 uval2;
11831183

11841184
/*
1185-
* If PF_EXITPIDONE is not yet set, then try again.
1185+
* If the futex exit state is not yet FUTEX_STATE_DEAD, wait
1186+
* for it to finish.
11861187
*/
1187-
if (tsk && !(tsk->flags & PF_EXITPIDONE))
1188+
if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
11881189
return -EAGAIN;
11891190

11901191
/*
@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
12031204
* *uaddr = 0xC0000000; tsk = get_task(PID);
12041205
* } if (!tsk->flags & PF_EXITING) {
12051206
* ... attach();
1206-
* tsk->flags |= PF_EXITPIDONE; } else {
1207-
* if (!(tsk->flags & PF_EXITPIDONE))
1207+
* tsk->futex_state = } else {
1208+
* FUTEX_STATE_DEAD; if (tsk->futex_state !=
1209+
* FUTEX_STATE_DEAD)
12081210
* return -EAGAIN;
12091211
* return -ESRCH; <--- FAIL
12101212
* }
@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
12601262
}
12611263

12621264
/*
1263-
* We need to look at the task state flags to figure out,
1264-
* whether the task is exiting. To protect against the do_exit
1265-
* change of the task flags, we do this protected by
1266-
* p->pi_lock:
1265+
* We need to look at the task state to figure out, whether the
1266+
* task is exiting. To protect against the change of the task state
1267+
* in futex_exit_release(), we do this protected by p->pi_lock:
12671268
*/
12681269
raw_spin_lock_irq(&p->pi_lock);
1269-
if (unlikely(p->flags & PF_EXITING)) {
1270+
if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
12701271
/*
1271-
* The task is on the way out. When PF_EXITPIDONE is
1272-
* set, we know that the task has finished the
1273-
* cleanup:
1272+
* The task is on the way out. When the futex state is
1273+
* FUTEX_STATE_DEAD, we know that the task has finished
1274+
* the cleanup:
12741275
*/
12751276
int ret = handle_exit_race(uaddr, uval, p);
12761277

0 commit comments

Comments
 (0)