Skip to content

Commit 363bee2

Browse files
committed
aio: keep poll requests on waitqueue until completed
Currently, aio_poll_wake() will always remove the poll request from the waitqueue. Then, if aio_poll_complete_work() sees that none of the polled events are ready and the request isn't cancelled, it re-adds the request to the waitqueue. (This can easily happen when polling a file that doesn't pass an event mask when waking up its waitqueue.)

This is fundamentally broken for two reasons:

1. If a wakeup occurs between vfs_poll() and the request being re-added to the waitqueue, it will be missed because the request wasn't on the waitqueue at the time. Therefore, IOCB_CMD_POLL might never complete even if the polled file is ready.

2. When the request isn't on the waitqueue, there is no way to be notified that the waitqueue is being freed (which happens when its lifetime is shorter than the struct file's). This is supposed to happen via the waitqueue entries being woken up with POLLFREE.

Therefore, leave the requests on the waitqueue until they are actually completed (or cancelled). To keep track of when aio_poll_complete_work needs to be scheduled, use new fields in struct poll_iocb. Remove the 'done' field which is now redundant.

Note that this is consistent with how sys_poll() and eventpoll work; their wakeup functions do *not* remove the waitqueue entries.

Fixes: 2c14fa8 ("aio: implement IOCB_CMD_POLL")
Cc: <[email protected]> # v4.18+
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Eric Biggers <[email protected]>
1 parent 9537bae commit 363bee2

File tree

1 file changed

+63
-20
lines changed

1 file changed

+63
-20
lines changed

fs/aio.c

Lines changed: 63 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,9 @@ struct poll_iocb {
181181
struct file *file;
182182
struct wait_queue_head *head;
183183
__poll_t events;
184-
bool done;
185184
bool cancelled;
185+
bool work_scheduled;
186+
bool work_need_resched;
186187
struct wait_queue_entry wait;
187188
struct work_struct work;
188189
};
@@ -1638,14 +1639,26 @@ static void aio_poll_complete_work(struct work_struct *work)
16381639
* avoid further branches in the fast path.
16391640
*/
16401641
spin_lock_irq(&ctx->ctx_lock);
1642+
spin_lock(&req->head->lock);
16411643
if (!mask && !READ_ONCE(req->cancelled)) {
1642-
add_wait_queue(req->head, &req->wait);
1644+
/*
1645+
* The request isn't actually ready to be completed yet.
1646+
* Reschedule completion if another wakeup came in.
1647+
*/
1648+
if (req->work_need_resched) {
1649+
schedule_work(&req->work);
1650+
req->work_need_resched = false;
1651+
} else {
1652+
req->work_scheduled = false;
1653+
}
1654+
spin_unlock(&req->head->lock);
16431655
spin_unlock_irq(&ctx->ctx_lock);
16441656
return;
16451657
}
1658+
list_del_init(&req->wait.entry);
1659+
spin_unlock(&req->head->lock);
16461660
list_del_init(&iocb->ki_list);
16471661
iocb->ki_res.res = mangle_poll(mask);
1648-
req->done = true;
16491662
spin_unlock_irq(&ctx->ctx_lock);
16501663

16511664
iocb_put(iocb);
@@ -1659,9 +1672,9 @@ static int aio_poll_cancel(struct kiocb *iocb)
16591672

16601673
spin_lock(&req->head->lock);
16611674
WRITE_ONCE(req->cancelled, true);
1662-
if (!list_empty(&req->wait.entry)) {
1663-
list_del_init(&req->wait.entry);
1675+
if (!req->work_scheduled) {
16641676
schedule_work(&aiocb->poll.work);
1677+
req->work_scheduled = true;
16651678
}
16661679
spin_unlock(&req->head->lock);
16671680

@@ -1680,20 +1693,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
16801693
if (mask && !(mask & req->events))
16811694
return 0;
16821695

1683-
list_del_init(&req->wait.entry);
1684-
1685-
if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
1696+
/*
1697+
* Complete the request inline if possible. This requires that three
1698+
* conditions be met:
1699+
* 1. An event mask must have been passed. If a plain wakeup was done
1700+
* instead, then mask == 0 and we have to call vfs_poll() to get
1701+
* the events, so inline completion isn't possible.
1702+
* 2. The completion work must not have already been scheduled.
1703+
* 3. ctx_lock must not be busy. We have to use trylock because we
1704+
* already hold the waitqueue lock, so this inverts the normal
1705+
* locking order. Use irqsave/irqrestore because not all
1706+
* filesystems (e.g. fuse) call this function with IRQs disabled,
1707+
* yet IRQs have to be disabled before ctx_lock is obtained.
1708+
*/
1709+
if (mask && !req->work_scheduled &&
1710+
spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
16861711
struct kioctx *ctx = iocb->ki_ctx;
16871712

1688-
/*
1689-
* Try to complete the iocb inline if we can. Use
1690-
* irqsave/irqrestore because not all filesystems (e.g. fuse)
1691-
* call this function with IRQs disabled and because IRQs
1692-
* have to be disabled before ctx_lock is obtained.
1693-
*/
1713+
list_del_init(&req->wait.entry);
16941714
list_del(&iocb->ki_list);
16951715
iocb->ki_res.res = mangle_poll(mask);
1696-
req->done = true;
16971716
if (iocb->ki_eventfd && eventfd_signal_allowed()) {
16981717
iocb = NULL;
16991718
INIT_WORK(&req->work, aio_poll_put_work);
@@ -1703,7 +1722,20 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
17031722
if (iocb)
17041723
iocb_put(iocb);
17051724
} else {
1706-
schedule_work(&req->work);
1725+
/*
1726+
* Schedule the completion work if needed. If it was already
1727+
* scheduled, record that another wakeup came in.
1728+
*
1729+
* Don't remove the request from the waitqueue here, as it might
1730+
* not actually be complete yet (we won't know until vfs_poll()
1731+
* is called), and we must not miss any wakeups.
1732+
*/
1733+
if (req->work_scheduled) {
1734+
req->work_need_resched = true;
1735+
} else {
1736+
schedule_work(&req->work);
1737+
req->work_scheduled = true;
1738+
}
17071739
}
17081740
return 1;
17091741
}
@@ -1750,8 +1782,9 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
17501782
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
17511783

17521784
req->head = NULL;
1753-
req->done = false;
17541785
req->cancelled = false;
1786+
req->work_scheduled = false;
1787+
req->work_need_resched = false;
17551788

17561789
apt.pt._qproc = aio_poll_queue_proc;
17571790
apt.pt._key = req->events;
@@ -1766,17 +1799,27 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
17661799
spin_lock_irq(&ctx->ctx_lock);
17671800
if (likely(req->head)) {
17681801
spin_lock(&req->head->lock);
1769-
if (unlikely(list_empty(&req->wait.entry))) {
1770-
if (apt.error)
1802+
if (list_empty(&req->wait.entry) || req->work_scheduled) {
1803+
/*
1804+
* aio_poll_wake() already either scheduled the async
1805+
* completion work, or completed the request inline.
1806+
*/
1807+
if (apt.error) /* unsupported case: multiple queues */
17711808
cancel = true;
17721809
apt.error = 0;
17731810
mask = 0;
17741811
}
17751812
if (mask || apt.error) {
1813+
/* Steal to complete synchronously. */
17761814
list_del_init(&req->wait.entry);
17771815
} else if (cancel) {
1816+
/* Cancel if possible (may be too late though). */
17781817
WRITE_ONCE(req->cancelled, true);
1779-
} else if (!req->done) { /* actually waiting for an event */
1818+
} else if (!list_empty(&req->wait.entry)) {
1819+
/*
1820+
* Actually waiting for an event, so add the request to
1821+
* active_reqs so that it can be cancelled if needed.
1822+
*/
17801823
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
17811824
aiocb->ki_cancel = aio_poll_cancel;
17821825
}

0 commit comments

Comments
 (0)