Skip to content

Commit 0d21e66

Browse files
committed
Merge tag 'aio-poll-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux
Pull aio poll fixes from Eric Biggers:

 "Fix three bugs in aio poll, and one issue with POLLFREE more broadly:

   - aio poll didn't handle POLLFREE, causing a use-after-free.
   - aio poll could block while the file is ready.
   - aio poll called eventfd_signal() when it isn't allowed.
   - POLLFREE didn't handle multiple exclusive waiters correctly.

  This has been tested with the libaio test suite, as well as with test
  programs I wrote that reproduce the first two bugs. I am sending this
  pull request myself as no one seems to be maintaining this code"

* tag 'aio-poll-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  aio: Fix incorrect usage of eventfd_signal_allowed()
  aio: fix use-after-free due to missing POLLFREE handling
  aio: keep poll requests on waitqueue until completed
  signalfd: use wake_up_pollfree()
  binder: use wake_up_pollfree()
  wait: add wake_up_pollfree()
2 parents b9172f9 + 4b37498 commit 0d21e66

File tree

6 files changed

+196
-58
lines changed

6 files changed

+196
-58
lines changed

drivers/android/binder.c

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4422,23 +4422,20 @@ static int binder_thread_release(struct binder_proc *proc,
44224422
__release(&t->lock);
44234423

44244424
/*
4425-
* If this thread used poll, make sure we remove the waitqueue
4426-
* from any epoll data structures holding it with POLLFREE.
4427-
* waitqueue_active() is safe to use here because we're holding
4428-
* the inner lock.
4425+
* If this thread used poll, make sure we remove the waitqueue from any
4426+
* poll data structures holding it.
44294427
*/
4430-
if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
4431-
waitqueue_active(&thread->wait)) {
4432-
wake_up_poll(&thread->wait, EPOLLHUP | POLLFREE);
4433-
}
4428+
if (thread->looper & BINDER_LOOPER_STATE_POLL)
4429+
wake_up_pollfree(&thread->wait);
44344430

44354431
binder_inner_proc_unlock(thread->proc);
44364432

44374433
/*
4438-
* This is needed to avoid races between wake_up_poll() above and
4439-
* and ep_remove_waitqueue() called for other reasons (eg the epoll file
4440-
* descriptor being closed); ep_remove_waitqueue() holds an RCU read
4441-
* lock, so we can be sure it's done after calling synchronize_rcu().
4434+
* This is needed to avoid races between wake_up_pollfree() above and
4435+
* someone else removing the last entry from the queue for other reasons
4436+
* (e.g. ep_remove_wait_queue() being called due to an epoll file
4437+
* descriptor being closed). Such other users hold an RCU read lock, so
4438+
* we can be sure they're done after we call synchronize_rcu().
44424439
*/
44434440
if (thread->looper & BINDER_LOOPER_STATE_POLL)
44444441
synchronize_rcu();

fs/aio.c

Lines changed: 152 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,9 @@ struct poll_iocb {
181181
struct file *file;
182182
struct wait_queue_head *head;
183183
__poll_t events;
184-
bool done;
185184
bool cancelled;
185+
bool work_scheduled;
186+
bool work_need_resched;
186187
struct wait_queue_entry wait;
187188
struct work_struct work;
188189
};
@@ -1619,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
16191620
iocb_put(iocb);
16201621
}
16211622

1623+
/*
1624+
* Safely lock the waitqueue which the request is on, synchronizing with the
1625+
* case where the ->poll() provider decides to free its waitqueue early.
1626+
*
1627+
* Returns true on success, meaning that req->head->lock was locked, req->wait
1628+
* is on req->head, and an RCU read lock was taken. Returns false if the
1629+
* request was already removed from its waitqueue (which might no longer exist).
1630+
*/
1631+
static bool poll_iocb_lock_wq(struct poll_iocb *req)
1632+
{
1633+
wait_queue_head_t *head;
1634+
1635+
/*
1636+
* While we hold the waitqueue lock and the waitqueue is nonempty,
1637+
* wake_up_pollfree() will wait for us. However, taking the waitqueue
1638+
* lock in the first place can race with the waitqueue being freed.
1639+
*
1640+
* We solve this as eventpoll does: by taking advantage of the fact that
1641+
* all users of wake_up_pollfree() will RCU-delay the actual free. If
1642+
* we enter rcu_read_lock() and see that the pointer to the queue is
1643+
* non-NULL, we can then lock it without the memory being freed out from
1644+
* under us, then check whether the request is still on the queue.
1645+
*
1646+
* Keep holding rcu_read_lock() as long as we hold the queue lock, in
1647+
* case the caller deletes the entry from the queue, leaving it empty.
1648+
* In that case, only RCU prevents the queue memory from being freed.
1649+
*/
1650+
rcu_read_lock();
1651+
head = smp_load_acquire(&req->head);
1652+
if (head) {
1653+
spin_lock(&head->lock);
1654+
if (!list_empty(&req->wait.entry))
1655+
return true;
1656+
spin_unlock(&head->lock);
1657+
}
1658+
rcu_read_unlock();
1659+
return false;
1660+
}
1661+
1662+
static void poll_iocb_unlock_wq(struct poll_iocb *req)
1663+
{
1664+
spin_unlock(&req->head->lock);
1665+
rcu_read_unlock();
1666+
}
1667+
16221668
static void aio_poll_complete_work(struct work_struct *work)
16231669
{
16241670
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,14 +1684,27 @@ static void aio_poll_complete_work(struct work_struct *work)
16381684
* avoid further branches in the fast path.
16391685
*/
16401686
spin_lock_irq(&ctx->ctx_lock);
1641-
if (!mask && !READ_ONCE(req->cancelled)) {
1642-
add_wait_queue(req->head, &req->wait);
1643-
spin_unlock_irq(&ctx->ctx_lock);
1644-
return;
1645-
}
1687+
if (poll_iocb_lock_wq(req)) {
1688+
if (!mask && !READ_ONCE(req->cancelled)) {
1689+
/*
1690+
* The request isn't actually ready to be completed yet.
1691+
* Reschedule completion if another wakeup came in.
1692+
*/
1693+
if (req->work_need_resched) {
1694+
schedule_work(&req->work);
1695+
req->work_need_resched = false;
1696+
} else {
1697+
req->work_scheduled = false;
1698+
}
1699+
poll_iocb_unlock_wq(req);
1700+
spin_unlock_irq(&ctx->ctx_lock);
1701+
return;
1702+
}
1703+
list_del_init(&req->wait.entry);
1704+
poll_iocb_unlock_wq(req);
1705+
} /* else, POLLFREE has freed the waitqueue, so we must complete */
16461706
list_del_init(&iocb->ki_list);
16471707
iocb->ki_res.res = mangle_poll(mask);
1648-
req->done = true;
16491708
spin_unlock_irq(&ctx->ctx_lock);
16501709

16511710
iocb_put(iocb);
@@ -1657,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
16571716
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
16581717
struct poll_iocb *req = &aiocb->poll;
16591718

1660-
spin_lock(&req->head->lock);
1661-
WRITE_ONCE(req->cancelled, true);
1662-
if (!list_empty(&req->wait.entry)) {
1663-
list_del_init(&req->wait.entry);
1664-
schedule_work(&aiocb->poll.work);
1665-
}
1666-
spin_unlock(&req->head->lock);
1719+
if (poll_iocb_lock_wq(req)) {
1720+
WRITE_ONCE(req->cancelled, true);
1721+
if (!req->work_scheduled) {
1722+
schedule_work(&aiocb->poll.work);
1723+
req->work_scheduled = true;
1724+
}
1725+
poll_iocb_unlock_wq(req);
1726+
} /* else, the request was force-cancelled by POLLFREE already */
16671727

16681728
return 0;
16691729
}
@@ -1680,21 +1740,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
16801740
if (mask && !(mask & req->events))
16811741
return 0;
16821742

1683-
list_del_init(&req->wait.entry);
1684-
1685-
if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
1743+
/*
1744+
* Complete the request inline if possible. This requires that three
1745+
* conditions be met:
1746+
* 1. An event mask must have been passed. If a plain wakeup was done
1747+
* instead, then mask == 0 and we have to call vfs_poll() to get
1748+
* the events, so inline completion isn't possible.
1749+
* 2. The completion work must not have already been scheduled.
1750+
* 3. ctx_lock must not be busy. We have to use trylock because we
1751+
* already hold the waitqueue lock, so this inverts the normal
1752+
* locking order. Use irqsave/irqrestore because not all
1753+
* filesystems (e.g. fuse) call this function with IRQs disabled,
1754+
* yet IRQs have to be disabled before ctx_lock is obtained.
1755+
*/
1756+
if (mask && !req->work_scheduled &&
1757+
spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
16861758
struct kioctx *ctx = iocb->ki_ctx;
16871759

1688-
/*
1689-
* Try to complete the iocb inline if we can. Use
1690-
* irqsave/irqrestore because not all filesystems (e.g. fuse)
1691-
* call this function with IRQs disabled and because IRQs
1692-
* have to be disabled before ctx_lock is obtained.
1693-
*/
1760+
list_del_init(&req->wait.entry);
16941761
list_del(&iocb->ki_list);
16951762
iocb->ki_res.res = mangle_poll(mask);
1696-
req->done = true;
1697-
if (iocb->ki_eventfd && eventfd_signal_allowed()) {
1763+
if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
16981764
iocb = NULL;
16991765
INIT_WORK(&req->work, aio_poll_put_work);
17001766
schedule_work(&req->work);
@@ -1703,14 +1769,51 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
17031769
if (iocb)
17041770
iocb_put(iocb);
17051771
} else {
1706-
schedule_work(&req->work);
1772+
/*
1773+
* Schedule the completion work if needed. If it was already
1774+
* scheduled, record that another wakeup came in.
1775+
*
1776+
* Don't remove the request from the waitqueue here, as it might
1777+
* not actually be complete yet (we won't know until vfs_poll()
1778+
* is called), and we must not miss any wakeups. POLLFREE is an
1779+
* exception to this; see below.
1780+
*/
1781+
if (req->work_scheduled) {
1782+
req->work_need_resched = true;
1783+
} else {
1784+
schedule_work(&req->work);
1785+
req->work_scheduled = true;
1786+
}
1787+
1788+
/*
1789+
* If the waitqueue is being freed early but we can't complete
1790+
* the request inline, we have to tear down the request as best
1791+
* we can. That means immediately removing the request from its
1792+
* waitqueue and preventing all further accesses to the
1793+
* waitqueue via the request. We also need to schedule the
1794+
* completion work (done above). Also mark the request as
1795+
* cancelled, to potentially skip an unneeded call to ->poll().
1796+
*/
1797+
if (mask & POLLFREE) {
1798+
WRITE_ONCE(req->cancelled, true);
1799+
list_del_init(&req->wait.entry);
1800+
1801+
/*
1802+
* Careful: this *must* be the last step, since as soon
1803+
* as req->head is NULL'ed out, the request can be
1804+
* completed and freed, since aio_poll_complete_work()
1805+
* will no longer need to take the waitqueue lock.
1806+
*/
1807+
smp_store_release(&req->head, NULL);
1808+
}
17071809
}
17081810
return 1;
17091811
}
17101812

17111813
struct aio_poll_table {
17121814
struct poll_table_struct pt;
17131815
struct aio_kiocb *iocb;
1816+
bool queued;
17141817
int error;
17151818
};
17161819

@@ -1721,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
17211824
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
17221825

17231826
/* multiple wait queues per file are not supported */
1724-
if (unlikely(pt->iocb->poll.head)) {
1827+
if (unlikely(pt->queued)) {
17251828
pt->error = -EINVAL;
17261829
return;
17271830
}
17281831

1832+
pt->queued = true;
17291833
pt->error = 0;
17301834
pt->iocb->poll.head = head;
17311835
add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1750,12 +1854,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
17501854
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
17511855

17521856
req->head = NULL;
1753-
req->done = false;
17541857
req->cancelled = false;
1858+
req->work_scheduled = false;
1859+
req->work_need_resched = false;
17551860

17561861
apt.pt._qproc = aio_poll_queue_proc;
17571862
apt.pt._key = req->events;
17581863
apt.iocb = aiocb;
1864+
apt.queued = false;
17591865
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
17601866

17611867
/* initialized the list so that we can do list_empty checks */
@@ -1764,23 +1870,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
17641870

17651871
mask = vfs_poll(req->file, &apt.pt) & req->events;
17661872
spin_lock_irq(&ctx->ctx_lock);
1767-
if (likely(req->head)) {
1768-
spin_lock(&req->head->lock);
1769-
if (unlikely(list_empty(&req->wait.entry))) {
1770-
if (apt.error)
1873+
if (likely(apt.queued)) {
1874+
bool on_queue = poll_iocb_lock_wq(req);
1875+
1876+
if (!on_queue || req->work_scheduled) {
1877+
/*
1878+
* aio_poll_wake() already either scheduled the async
1879+
* completion work, or completed the request inline.
1880+
*/
1881+
if (apt.error) /* unsupported case: multiple queues */
17711882
cancel = true;
17721883
apt.error = 0;
17731884
mask = 0;
17741885
}
17751886
if (mask || apt.error) {
1887+
/* Steal to complete synchronously. */
17761888
list_del_init(&req->wait.entry);
17771889
} else if (cancel) {
1890+
/* Cancel if possible (may be too late though). */
17781891
WRITE_ONCE(req->cancelled, true);
1779-
} else if (!req->done) { /* actually waiting for an event */
1892+
} else if (on_queue) {
1893+
/*
1894+
* Actually waiting for an event, so add the request to
1895+
* active_reqs so that it can be cancelled if needed.
1896+
*/
17801897
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
17811898
aiocb->ki_cancel = aio_poll_cancel;
17821899
}
1783-
spin_unlock(&req->head->lock);
1900+
if (on_queue)
1901+
poll_iocb_unlock_wq(req);
17841902
}
17851903
if (mask) { /* no async, we'd stolen it */
17861904
aiocb->ki_res.res = mangle_poll(mask);

fs/signalfd.c

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,7 @@
3535

3636
void signalfd_cleanup(struct sighand_struct *sighand)
3737
{
38-
wait_queue_head_t *wqh = &sighand->signalfd_wqh;
39-
/*
40-
* The lockless check can race with remove_wait_queue() in progress,
41-
* but in this case its caller should run under rcu_read_lock() and
42-
* sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
43-
*/
44-
if (likely(!waitqueue_active(wqh)))
45-
return;
46-
47-
/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
48-
wake_up_poll(wqh, EPOLLHUP | POLLFREE);
38+
wake_up_pollfree(&sighand->signalfd_wqh);
4939
}
5040

5141
struct signalfd_ctx {

include/linux/wait.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
217217
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
218218
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
219219
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
220+
void __wake_up_pollfree(struct wait_queue_head *wq_head);
220221

221222
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
222223
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
245246
#define wake_up_interruptible_sync_poll_locked(x, m) \
246247
__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
247248

249+
/**
250+
* wake_up_pollfree - signal that a polled waitqueue is going away
251+
* @wq_head: the wait queue head
252+
*
253+
* In the very rare cases where a ->poll() implementation uses a waitqueue whose
254+
* lifetime is tied to a task rather than to the 'struct file' being polled,
255+
* this function must be called before the waitqueue is freed so that
256+
* non-blocking polls (e.g. epoll) are notified that the queue is going away.
257+
*
258+
* The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
259+
* an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
260+
*/
261+
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
262+
{
263+
/*
264+
* For performance reasons, we don't always take the queue lock here.
265+
* Therefore, we might race with someone removing the last entry from
266+
* the queue, and proceed while they still hold the queue lock.
267+
* However, rcu_read_lock() is required to be held in such cases, so we
268+
* can safely proceed with an RCU-delayed free.
269+
*/
270+
if (waitqueue_active(wq_head))
271+
__wake_up_pollfree(wq_head);
272+
}
273+
248274
#define ___wait_cond_timeout(condition) \
249275
({ \
250276
bool __cond = (condition); \

include/uapi/asm-generic/poll.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
#define POLLRDHUP 0x2000
3030
#endif
3131

32-
#define POLLFREE (__force __poll_t)0x4000 /* currently only for epoll */
32+
#define POLLFREE (__force __poll_t)0x4000
3333

3434
#define POLL_BUSY_LOOP (__force __poll_t)0x8000
3535

kernel/sched/wait.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
238238
}
239239
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
240240

241+
void __wake_up_pollfree(struct wait_queue_head *wq_head)
242+
{
243+
__wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
244+
/* POLLFREE must have cleared the queue. */
245+
WARN_ON_ONCE(waitqueue_active(wq_head));
246+
}
247+
241248
/*
242249
* Note: we use "set_current_state()" _after_ the wait-queue add,
243250
* because we need a memory barrier there on SMP, so that any

0 commit comments

Comments
 (0)