Skip to content

Commit 2a9127f

Browse files
committed
mm: rewrite wait_on_page_bit_common() logic
It turns out that wait_on_page_bit_common() had several problems, ranging from unfair behavior due to re-queueing at the end of the wait queue when re-trying, to an outright bug that could result in missed wakeups (but probably never happened in practice). This rewrites the whole logic to avoid both issues, by simply moving the logic to check (and possibly take) the bit lock into the wakeup path instead. That makes everything much more straightforward, and means that we never need to re-queue the wait entry: if we get woken up, we'll be notified through WQ_FLAG_WOKEN, and the wait queue entry will have been removed, and everything will have been done for us. Link: https://lore.kernel.org/lkml/CAHk-=wjJA2Z3kUFb-5s=6+n0qbTs8ELqKFt9B3pH85a8fGD73w@mail.gmail.com/ Link: https://lore.kernel.org/lkml/[email protected]/ Reported-by: Oleg Nesterov <[email protected]> Reported-by: Hugh Dickins <[email protected]> Cc: Michal Hocko <[email protected]> Reviewed-by: Oleg Nesterov <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent bcf8768 commit 2a9127f

File tree

1 file changed

+85
-47
lines changed

1 file changed

+85
-47
lines changed

mm/filemap.c

Lines changed: 85 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,6 +1002,7 @@ struct wait_page_queue {
10021002

10031003
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
10041004
{
1005+
int ret;
10051006
struct wait_page_key *key = arg;
10061007
struct wait_page_queue *wait_page
10071008
= container_of(wait, struct wait_page_queue, wait);
@@ -1014,17 +1015,40 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
10141015
return 0;
10151016

10161017
/*
1017-
* Stop walking if it's locked.
1018-
* Is this safe if put_and_wait_on_page_locked() is in use?
1019-
* Yes: the waker must hold a reference to this page, and if PG_locked
1020-
* has now already been set by another task, that task must also hold
1021-
* a reference to the *same usage* of this page; so there is no need
1022-
* to walk on to wake even the put_and_wait_on_page_locked() callers.
1018+
* If it's an exclusive wait, we get the bit for it, and
1019+
* stop walking if we can't.
1020+
*
1021+
* If it's a non-exclusive wait, then the fact that this
1022+
* wake function was called means that the bit already
1023+
* was cleared, and we don't care if somebody then
1024+
* re-took it.
10231025
*/
1024-
if (test_bit(key->bit_nr, &key->page->flags))
1025-
return -1;
1026+
ret = 0;
1027+
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
1028+
if (test_and_set_bit(key->bit_nr, &key->page->flags))
1029+
return -1;
1030+
ret = 1;
1031+
}
1032+
wait->flags |= WQ_FLAG_WOKEN;
10261033

1027-
return autoremove_wake_function(wait, mode, sync, key);
1034+
wake_up_state(wait->private, mode);
1035+
1036+
/*
1037+
* Ok, we have successfully done what we're waiting for,
1038+
* and we can unconditionally remove the wait entry.
1039+
*
1040+
* Note that this has to be the absolute last thing we do,
1041+
* since after list_del_init(&wait->entry) the wait entry
1042+
* might be de-allocated and the process might even have
1043+
* exited.
1044+
*
1045+
* We _really_ should have a "list_del_init_careful()" to
1046+
* properly pair with the unlocked "list_empty_careful()"
1047+
* in finish_wait().
1048+
*/
1049+
smp_mb();
1050+
list_del_init(&wait->entry);
1051+
return ret;
10281052
}
10291053

10301054
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1103,16 +1127,31 @@ enum behavior {
11031127
*/
11041128
};
11051129

1130+
/*
1131+
* Attempt to check (or get) the page bit, and mark the
1132+
* waiter woken if successful.
1133+
*/
1134+
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
1135+
struct wait_queue_entry *wait)
1136+
{
1137+
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
1138+
if (test_and_set_bit(bit_nr, &page->flags))
1139+
return false;
1140+
} else if (test_bit(bit_nr, &page->flags))
1141+
return false;
1142+
1143+
wait->flags |= WQ_FLAG_WOKEN;
1144+
return true;
1145+
}
1146+
11061147
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
11071148
struct page *page, int bit_nr, int state, enum behavior behavior)
11081149
{
11091150
struct wait_page_queue wait_page;
11101151
wait_queue_entry_t *wait = &wait_page.wait;
1111-
bool bit_is_set;
11121152
bool thrashing = false;
11131153
bool delayacct = false;
11141154
unsigned long pflags;
1115-
int ret = 0;
11161155

11171156
if (bit_nr == PG_locked &&
11181157
!PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,48 +1169,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
11301169
wait_page.page = page;
11311170
wait_page.bit_nr = bit_nr;
11321171

1133-
for (;;) {
1134-
spin_lock_irq(&q->lock);
1172+
/*
1173+
* Do one last check whether we can get the
1174+
* page bit synchronously.
1175+
*
1176+
* Do the SetPageWaiters() marking before that
1177+
* to let any waker we _just_ missed know they
1178+
* need to wake us up (otherwise they'll never
1179+
* even go to the slow case that looks at the
1180+
* page queue), and add ourselves to the wait
1181+
* queue if we need to sleep.
1182+
*
1183+
* This part needs to be done under the queue
1184+
* lock to avoid races.
1185+
*/
1186+
spin_lock_irq(&q->lock);
1187+
SetPageWaiters(page);
1188+
if (!trylock_page_bit_common(page, bit_nr, wait))
1189+
__add_wait_queue_entry_tail(q, wait);
1190+
spin_unlock_irq(&q->lock);
11351191

1136-
if (likely(list_empty(&wait->entry))) {
1137-
__add_wait_queue_entry_tail(q, wait);
1138-
SetPageWaiters(page);
1139-
}
1192+
/*
1193+
* From now on, all the logic will be based on
1194+
the WQ_FLAG_WOKEN flag, and the page
1195+
* bit testing (and setting) will be - or has
1196+
* already been - done by the wake function.
1197+
*
1198+
* We can drop our reference to the page.
1199+
*/
1200+
if (behavior == DROP)
1201+
put_page(page);
11401202

1203+
for (;;) {
11411204
set_current_state(state);
11421205

1143-
spin_unlock_irq(&q->lock);
1144-
1145-
bit_is_set = test_bit(bit_nr, &page->flags);
1146-
if (behavior == DROP)
1147-
put_page(page);
1148-
1149-
if (likely(bit_is_set))
1150-
io_schedule();
1151-
1152-
if (behavior == EXCLUSIVE) {
1153-
if (!test_and_set_bit_lock(bit_nr, &page->flags))
1154-
break;
1155-
} else if (behavior == SHARED) {
1156-
if (!test_bit(bit_nr, &page->flags))
1157-
break;
1158-
}
1159-
1160-
if (signal_pending_state(state, current)) {
1161-
ret = -EINTR;
1206+
if (signal_pending_state(state, current))
11621207
break;
1163-
}
11641208

1165-
if (behavior == DROP) {
1166-
/*
1167-
* We can no longer safely access page->flags:
1168-
* even if CONFIG_MEMORY_HOTREMOVE is not enabled,
1169-
* there is a risk of waiting forever on a page reused
1170-
* for something that keeps it locked indefinitely.
1171-
* But best check for -EINTR above before breaking.
1172-
*/
1209+
if (wait->flags & WQ_FLAG_WOKEN)
11731210
break;
1174-
}
1211+
1212+
io_schedule();
11751213
}
11761214

11771215
finish_wait(q, wait);
@@ -1190,7 +1228,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
11901228
* bother with signals either.
11911229
*/
11921230

1193-
return ret;
1231+
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
11941232
}
11951233

11961234
void wait_on_page_bit(struct page *page, int bit_nr)

0 commit comments

Comments
 (0)