Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 9 additions & 12 deletions drivers/android/binder.c
Original file line number Diff line number Diff line change
Expand Up @@ -4420,23 +4420,20 @@ static int binder_thread_release(struct binder_proc *proc,
__release(&t->lock);

/*
* If this thread used poll, make sure we remove the waitqueue
* from any epoll data structures holding it with POLLFREE.
* waitqueue_active() is safe to use here because we're holding
* the inner lock.
* If this thread used poll, make sure we remove the waitqueue from any
* poll data structures holding it.
*/
if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
waitqueue_active(&thread->wait)) {
wake_up_poll(&thread->wait, EPOLLHUP | POLLFREE);
}
if (thread->looper & BINDER_LOOPER_STATE_POLL)
wake_up_pollfree(&thread->wait);

binder_inner_proc_unlock(thread->proc);

/*
* This is needed to avoid races between wake_up_poll() above and
* and ep_remove_waitqueue() called for other reasons (eg the epoll file
* descriptor being closed); ep_remove_waitqueue() holds an RCU read
* lock, so we can be sure it's done after calling synchronize_rcu().
* This is needed to avoid races between wake_up_pollfree() above and
* someone else removing the last entry from the queue for other reasons
* (e.g. ep_remove_wait_queue() being called due to an epoll file
* descriptor being closed). Such other users hold an RCU read lock, so
* we can be sure they're done after we call synchronize_rcu().
*/
if (thread->looper & BINDER_LOOPER_STATE_POLL)
synchronize_rcu();
Expand Down
184 changes: 151 additions & 33 deletions fs/aio.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
bool done;
bool cancelled;
bool work_scheduled;
bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
Expand Down Expand Up @@ -1623,6 +1624,51 @@ static void aio_poll_put_work(struct work_struct *work)
iocb_put(iocb);
}

/*
* Safely lock the waitqueue which the request is on, synchronizing with the
* case where the ->poll() provider decides to free its waitqueue early.
*
* Returns true on success, meaning that req->head->lock was locked, req->wait
* is on req->head, and an RCU read lock was taken. Returns false if the
* request was already removed from its waitqueue (which might no longer exist).
*/
static bool poll_iocb_lock_wq(struct poll_iocb *req)
{
wait_queue_head_t *head;

/*
* While we hold the waitqueue lock and the waitqueue is nonempty,
* wake_up_pollfree() will wait for us. However, taking the waitqueue
* lock in the first place can race with the waitqueue being freed.
*
* We solve this as eventpoll does: by taking advantage of the fact that
* all users of wake_up_pollfree() will RCU-delay the actual free. If
* we enter rcu_read_lock() and see that the pointer to the queue is
* non-NULL, we can then lock it without the memory being freed out from
* under us, then check whether the request is still on the queue.
*
* Keep holding rcu_read_lock() as long as we hold the queue lock, in
* case the caller deletes the entry from the queue, leaving it empty.
* In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
head = smp_load_acquire(&req->head);
if (head) {
spin_lock(&head->lock);
if (!list_empty(&req->wait.entry))
return true;
spin_unlock(&head->lock);
}
rcu_read_unlock();
return false;
}

static void poll_iocb_unlock_wq(struct poll_iocb *req)
{
spin_unlock(&req->head->lock);
rcu_read_unlock();
}

static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
Expand All @@ -1642,14 +1688,27 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
if (!mask && !READ_ONCE(req->cancelled)) {
add_wait_queue(req->head, &req->wait);
spin_unlock_irq(&ctx->ctx_lock);
return;
}
if (poll_iocb_lock_wq(req)) {
if (!mask && !READ_ONCE(req->cancelled)) {
/*
* The request isn't actually ready to be completed yet.
* Reschedule completion if another wakeup came in.
*/
if (req->work_need_resched) {
schedule_work(&req->work);
req->work_need_resched = false;
} else {
req->work_scheduled = false;
}
poll_iocb_unlock_wq(req);
spin_unlock_irq(&ctx->ctx_lock);
return;
}
list_del_init(&req->wait.entry);
poll_iocb_unlock_wq(req);
} /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
spin_unlock_irq(&ctx->ctx_lock);

iocb_put(iocb);
Expand All @@ -1661,13 +1720,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
struct poll_iocb *req = &aiocb->poll;

spin_lock(&req->head->lock);
WRITE_ONCE(req->cancelled, true);
if (!list_empty(&req->wait.entry)) {
list_del_init(&req->wait.entry);
schedule_work(&aiocb->poll.work);
}
spin_unlock(&req->head->lock);
if (poll_iocb_lock_wq(req)) {
WRITE_ONCE(req->cancelled, true);
if (!req->work_scheduled) {
schedule_work(&aiocb->poll.work);
req->work_scheduled = true;
}
poll_iocb_unlock_wq(req);
} /* else, the request was force-cancelled by POLLFREE already */

return 0;
}
Expand All @@ -1684,20 +1744,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;

list_del_init(&req->wait.entry);

if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
/*
* Complete the request inline if possible. This requires that three
* conditions be met:
* 1. An event mask must have been passed. If a plain wakeup was done
* instead, then mask == 0 and we have to call vfs_poll() to get
* the events, so inline completion isn't possible.
* 2. The completion work must not have already been scheduled.
* 3. ctx_lock must not be busy. We have to use trylock because we
* already hold the waitqueue lock, so this inverts the normal
* locking order. Use irqsave/irqrestore because not all
* filesystems (e.g. fuse) call this function with IRQs disabled,
* yet IRQs have to be disabled before ctx_lock is obtained.
*/
if (mask && !req->work_scheduled &&
spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;

/*
* Try to complete the iocb inline if we can. Use
* irqsave/irqrestore because not all filesystems (e.g. fuse)
* call this function with IRQs disabled and because IRQs
* have to be disabled before ctx_lock is obtained.
*/
list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
if (iocb->ki_eventfd && eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
Expand All @@ -1707,14 +1773,51 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
schedule_work(&req->work);
/*
* Schedule the completion work if needed. If it was already
* scheduled, record that another wakeup came in.
*
* Don't remove the request from the waitqueue here, as it might
* not actually be complete yet (we won't know until vfs_poll()
* is called), and we must not miss any wakeups. POLLFREE is an
* exception to this; see below.
*/
if (req->work_scheduled) {
req->work_need_resched = true;
} else {
schedule_work(&req->work);
req->work_scheduled = true;
}

/*
* If the waitqueue is being freed early but we can't complete
* the request inline, we have to tear down the request as best
* we can. That means immediately removing the request from its
* waitqueue and preventing all further accesses to the
* waitqueue via the request. We also need to schedule the
* completion work (done above). Also mark the request as
* cancelled, to potentially skip an unneeded call to ->poll().
*/
if (mask & POLLFREE) {
WRITE_ONCE(req->cancelled, true);
list_del_init(&req->wait.entry);

/*
* Careful: this *must* be the last step, since as soon
* as req->head is NULL'ed out, the request can be
* completed and freed, since aio_poll_complete_work()
* will no longer need to take the waitqueue lock.
*/
smp_store_release(&req->head, NULL);
}
}
return 1;
}

struct aio_poll_table {
struct poll_table_struct pt;
struct aio_kiocb *iocb;
bool queued;
int error;
};

Expand All @@ -1725,11 +1828,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);

/* multiple wait queues per file are not supported */
if (unlikely(pt->iocb->poll.head)) {
if (unlikely(pt->queued)) {
pt->error = -EINVAL;
return;
}

pt->queued = true;
pt->error = 0;
pt->iocb->poll.head = head;
add_wait_queue(head, &pt->iocb->poll.wait);
Expand All @@ -1754,12 +1858,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;

req->head = NULL;
req->done = false;
req->cancelled = false;
req->work_scheduled = false;
req->work_need_resched = false;

apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */

/* initialized the list so that we can do list_empty checks */
Expand All @@ -1768,23 +1874,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)

mask = vfs_poll(req->file, &apt.pt) & req->events;
spin_lock_irq(&ctx->ctx_lock);
if (likely(req->head)) {
spin_lock(&req->head->lock);
if (unlikely(list_empty(&req->wait.entry))) {
if (apt.error)
if (likely(apt.queued)) {
bool on_queue = poll_iocb_lock_wq(req);

if (!on_queue || req->work_scheduled) {
/*
* aio_poll_wake() already either scheduled the async
* completion work, or completed the request inline.
*/
if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
/* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
/* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
} else if (!req->done) { /* actually waiting for an event */
} else if (on_queue) {
/*
* Actually waiting for an event, so add the request to
* active_reqs so that it can be cancelled if needed.
*/
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}
spin_unlock(&req->head->lock);
if (on_queue)
poll_iocb_unlock_wq(req);
}
if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);
Expand Down
12 changes: 1 addition & 11 deletions fs/signalfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,7 @@

void signalfd_cleanup(struct sighand_struct *sighand)
{
wait_queue_head_t *wqh = &sighand->signalfd_wqh;
/*
* The lockless check can race with remove_wait_queue() in progress,
* but in this case its caller should run under rcu_read_lock() and
* sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
*/
if (likely(!waitqueue_active(wqh)))
return;

/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
wake_up_poll(wqh, EPOLLHUP | POLLFREE);
wake_up_pollfree(&sighand->signalfd_wqh);
}

struct signalfd_ctx {
Expand Down
26 changes: 26 additions & 0 deletions include/linux/wait.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
Expand Down Expand Up @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
#define wake_up_interruptible_sync_poll_locked(x, m) \
__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
* wake_up_pollfree - signal that a polled waitqueue is going away
* @wq_head: the wait queue head
*
* In the very rare cases where a ->poll() implementation uses a waitqueue whose
* lifetime is tied to a task rather than to the 'struct file' being polled,
* this function must be called before the waitqueue is freed so that
* non-blocking polls (e.g. epoll) are notified that the queue is going away.
*
* The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
* an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
*/
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
/*
* For performance reasons, we don't always take the queue lock here.
* Therefore, we might race with someone removing the last entry from
* the queue, and proceed while they still hold the queue lock.
* However, rcu_read_lock() is required to be held in such cases, so we
* can safely proceed with an RCU-delayed free.
*/
if (waitqueue_active(wq_head))
__wake_up_pollfree(wq_head);
}

#define ___wait_cond_timeout(condition) \
({ \
bool __cond = (condition); \
Expand Down
2 changes: 1 addition & 1 deletion include/uapi/asm-generic/poll.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#define POLLRDHUP 0x2000
#endif

#define POLLFREE (__force __poll_t)0x4000 /* currently only for epoll */
#define POLLFREE (__force __poll_t)0x4000

#define POLL_BUSY_LOOP (__force __poll_t)0x8000

Expand Down
7 changes: 4 additions & 3 deletions kernel/sched/psi.c
Original file line number Diff line number Diff line change
Expand Up @@ -1200,10 +1200,11 @@ void psi_trigger_destroy(struct psi_trigger *t)

group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
* Wakeup waiters to stop polling and clear the queue to prevent it from
* being accessed later. Can happen if cgroup is deleted from under a
* polling process.
*/
wake_up_interruptible(&t->event_wait);
wake_up_pollfree(&t->event_wait);

mutex_lock(&group->trigger_lock);

Expand Down
Loading