@@ -1620,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken. Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+	wait_queue_head_t *head;
+
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us, then check whether the request is still on the queue.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	head = smp_load_acquire(&req->head);
+	if (head) {
+		spin_lock(&head->lock);
+		if (!list_empty(&req->wait.entry))
+			return true;
+		spin_unlock(&head->lock);
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+	spin_unlock(&req->head->lock);
+	rcu_read_unlock();
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
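For context, the contract that the comment in poll_iocb_lock_wq() relies on lives on the ->poll() provider side: a waitqueue that can disappear while waiters are still registered must be announced with wake_up_pollfree(), and its memory may only be reclaimed after an RCU grace period. Roughly, such a provider is expected to look like the following sketch (the foo_device struct and function are hypothetical, not part of this change):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/wait.h>

/* Hypothetical object whose embedded waitqueue can go away early. */
struct foo_device {
	wait_queue_head_t wqh;
	struct rcu_head rcu;
};

static void foo_device_destroy(struct foo_device *dev)
{
	/*
	 * Wake all waiters with POLLFREE so that poll implementations
	 * such as aio and epoll detach themselves from dev->wqh.
	 */
	wake_up_pollfree(&dev->wqh);

	/*
	 * Delay the actual free by an RCU grace period, so that a racing
	 * poll_iocb_lock_wq() that already loaded a non-NULL req->head
	 * can still safely take dev->wqh.lock under rcu_read_lock().
	 */
	kfree_rcu(dev, rcu);
}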
@@ -1639,24 +1684,25 @@ static void aio_poll_complete_work(struct work_struct *work)
 	 * avoid further branches in the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	if (!mask && !READ_ONCE(req->cancelled)) {
-		/*
-		 * The request isn't actually ready to be completed yet.
-		 * Reschedule completion if another wakeup came in.
-		 */
-		if (req->work_need_resched) {
-			schedule_work(&req->work);
-			req->work_need_resched = false;
-		} else {
-			req->work_scheduled = false;
+	if (poll_iocb_lock_wq(req)) {
+		if (!mask && !READ_ONCE(req->cancelled)) {
+			/*
+			 * The request isn't actually ready to be completed yet.
+			 * Reschedule completion if another wakeup came in.
+			 */
+			if (req->work_need_resched) {
+				schedule_work(&req->work);
+				req->work_need_resched = false;
+			} else {
+				req->work_scheduled = false;
+			}
+			poll_iocb_unlock_wq(req);
+			spin_unlock_irq(&ctx->ctx_lock);
+			return;
 		}
-		spin_unlock(&req->head->lock);
-		spin_unlock_irq(&ctx->ctx_lock);
-		return;
-	}
-	list_del_init(&req->wait.entry);
-	spin_unlock(&req->head->lock);
+		list_del_init(&req->wait.entry);
+		poll_iocb_unlock_wq(req);
+	} /* else, POLLFREE has freed the waitqueue, so we must complete */
 	list_del_init(&iocb->ki_list);
 	iocb->ki_res.res = mangle_poll(mask);
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1670,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
 	struct poll_iocb *req = &aiocb->poll;
 
-	spin_lock(&req->head->lock);
-	WRITE_ONCE(req->cancelled, true);
-	if (!req->work_scheduled) {
-		schedule_work(&aiocb->poll.work);
-		req->work_scheduled = true;
-	}
-	spin_unlock(&req->head->lock);
+	if (poll_iocb_lock_wq(req)) {
+		WRITE_ONCE(req->cancelled, true);
+		if (!req->work_scheduled) {
+			schedule_work(&aiocb->poll.work);
+			req->work_scheduled = true;
+		}
+		poll_iocb_unlock_wq(req);
+	} /* else, the request was force-cancelled by POLLFREE already */
 
 	return 0;
 }
@@ -1728,21 +1775,45 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		 *
 		 * Don't remove the request from the waitqueue here, as it might
 		 * not actually be complete yet (we won't know until vfs_poll()
-		 * is called), and we must not miss any wakeups.
+		 * is called), and we must not miss any wakeups. POLLFREE is an
+		 * exception to this; see below.
 		 */
 		if (req->work_scheduled) {
 			req->work_need_resched = true;
 		} else {
 			schedule_work(&req->work);
 			req->work_scheduled = true;
 		}
+
+		/*
+		 * If the waitqueue is being freed early but we can't complete
+		 * the request inline, we have to tear down the request as best
+		 * we can. That means immediately removing the request from its
+		 * waitqueue and preventing all further accesses to the
+		 * waitqueue via the request. We also need to schedule the
+		 * completion work (done above). Also mark the request as
+		 * cancelled, to potentially skip an unneeded call to ->poll().
+		 */
+		if (mask & POLLFREE) {
+			WRITE_ONCE(req->cancelled, true);
+			list_del_init(&req->wait.entry);
+
+			/*
+			 * Careful: this *must* be the last step, since as soon
+			 * as req->head is NULL'ed out, the request can be
+			 * completed and freed, since aio_poll_complete_work()
+			 * will no longer need to take the waitqueue lock.
+			 */
+			smp_store_release(&req->head, NULL);
+		}
 	}
 	return 1;
 }
 
 struct aio_poll_table {
 	struct poll_table_struct pt;
 	struct aio_kiocb *iocb;
+	bool queued;
 	int error;
 };
 
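The smp_store_release() above pairs with the smp_load_acquire() in poll_iocb_lock_wq(): NULLing req->head is the publication step that lets a concurrent locker know the request has been fully detached. As a rough, self-contained sketch of that pairing (hypothetical bar_req names, not code from this change):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct bar_req {
	wait_queue_head_t *head;	/* NULL once detached from the queue */
	struct wait_queue_entry wait;
};

/* Detach side: clearing ->head must be the last step. */
static void bar_detach(struct bar_req *req)
{
	list_del_init(&req->wait.entry);
	/* Pairs with smp_load_acquire() in bar_lock_wq() below. */
	smp_store_release(&req->head, NULL);
}

/* Lock side: only dereference ->head while it is still published. */
static bool bar_lock_wq(struct bar_req *req)
{
	wait_queue_head_t *head;

	rcu_read_lock();
	head = smp_load_acquire(&req->head);
	if (head) {
		spin_lock(&head->lock);
		if (!list_empty(&req->wait.entry))
			return true;	/* caller drops the lock and RCU */
		spin_unlock(&head->lock);
	}
	rcu_read_unlock();
	return false;
}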
@@ -1753,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
 
 	/* multiple wait queues per file are not supported */
-	if (unlikely(pt->iocb->poll.head)) {
+	if (unlikely(pt->queued)) {
 		pt->error = -EINVAL;
 		return;
 	}
 
+	pt->queued = true;
 	pt->error = 0;
 	pt->iocb->poll.head = head;
 	add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1789,6 +1861,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
 	apt.iocb = aiocb;
+	apt.queued = false;
 	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
 
 	/* initialized the list so that we can do list_empty checks */
@@ -1797,9 +1870,10 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(req->head)) {
-		spin_lock(&req->head->lock);
-		if (list_empty(&req->wait.entry) || req->work_scheduled) {
+	if (likely(apt.queued)) {
+		bool on_queue = poll_iocb_lock_wq(req);
+
+		if (!on_queue || req->work_scheduled) {
 			/*
 			 * aio_poll_wake() already either scheduled the async
 			 * completion work, or completed the request inline.
@@ -1815,15 +1889,16 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 		} else if (cancel) {
 			/* Cancel if possible (may be too late though). */
 			WRITE_ONCE(req->cancelled, true);
-		} else if (!list_empty(&req->wait.entry)) {
+		} else if (on_queue) {
 			/*
 			 * Actually waiting for an event, so add the request to
 			 * active_reqs so that it can be cancelled if needed.
 			 */
 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
 			aiocb->ki_cancel = aio_poll_cancel;
 		}
-		spin_unlock(&req->head->lock);
+		if (on_queue)
+			poll_iocb_unlock_wq(req);
 	}
 	if (mask) { /* no async, we'd stolen it */
 		aiocb->ki_res.res = mangle_poll(mask);
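For reference, this code path can be exercised from userspace by submitting an IOCB_CMD_POLL request through the raw AIO syscalls. A minimal sketch (error handling trimmed; field usage as assumed from the uapi headers):

#include <linux/aio_abi.h>
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static long io_setup(unsigned nr, aio_context_t *ctxp)
{
	return syscall(__NR_io_setup, nr, ctxp);
}

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(__NR_io_submit, ctx, nr, iocbpp);
}

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout);
}

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb = { 0 }, *cbs[1] = { &cb };
	struct io_event ev;

	if (io_setup(1, &ctx) < 0)
		return 1;

	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_fildes = STDIN_FILENO;
	cb.aio_buf = POLLIN;		/* poll events of interest */

	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/* Blocks until stdin is readable; ev.res carries the revents mask. */
	if (io_getevents(ctx, 1, 1, &ev, NULL) == 1)
		printf("revents: 0x%llx\n", (unsigned long long)ev.res);
	return 0;
}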