
Commit ee10f6f

Christoph Hellwig authored and cmaiolino committed
xfs: fix buffer lookup vs release race
Since commit 298f342 ("xfs: lockless buffer lookup") the buffer lookup
fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection. But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.

Fix this by doing all b_hold modifications under b_lock. We're already
doing this for release so this "only" ~ doubles the b_lock round trips.
We'll later look into the lockref infrastructure to optimize the number
of lock round trips again.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Signed-off-by: Carlos Maiolino <[email protected]>
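To make the race concrete: before this patch, the release side dropped b_hold to 0 with atomic_dec_and_lock() and only re-incremented it after moving the buffer to the LRU, while the lockless lookup guarded itself with atomic_inc_not_zero(). A condensed interleaving, pieced together from the hunks below (an illustrative sketch, not verbatim kernel source):

/*
 * Condensed pre-patch interleaving (illustrative sketch): CPU 1 drops
 * the last regular reference while CPU 2 performs a lockless lookup
 * of the same buffer.
 *
 *  CPU 1: xfs_buf_rele_cached()          CPU 2: xfs_buf_lookup()
 *  ----------------------------          -----------------------
 *  atomic_dec_and_lock(&bp->b_hold,
 *                      &bch->bc_lock);
 *      b_hold is now 0
 *                                        rcu_read_lock();
 *                                        bp = rhashtable_lookup(...);
 *                                        atomic_inc_not_zero(&bp->b_hold);
 *                                            reads 0 and fails, so the
 *                                            lookup returns -ENOENT even
 *                                            though the buffer stays cached
 *  list_lru_add_obj(&btp->bt_lru,
 *                   &bp->b_lru);
 *  atomic_inc(&bp->b_hold);
 *      b_hold is back to 1, held by the LRU
 */

With the patch, the LRU transition keeps the reference instead of dropping and re-taking it, and both lookup and release manipulate b_hold under b_lock, so a lookup can no longer observe a transient 0 on a cached buffer.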
1 parent 07eae0f commit ee10f6f

3 files changed: +54, -51 lines

fs/xfs/xfs_buf.c

Lines changed: 48 additions & 45 deletions
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
 	}
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-	spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		atomic_dec(&bp->b_hold);
+		bp->b_hold--;
 
-	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	ASSERT(bp->b_hold >= 1);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
 	 */
 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-	atomic_set(&bp->b_hold, 1);
+	spin_lock_init(&bp->b_lock);
+	bp->b_hold = 1;
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_li_list);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
 	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
 	return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+	struct xfs_buf	*bp)
+{
+	spin_lock(&bp->b_lock);
+	if (bp->b_hold == 0) {
+		spin_unlock(&bp->b_lock);
+		return false;
+	}
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
+	return true;
+}
+
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+	if (!bp || !xfs_buf_try_hold(bp)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
 		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
-	if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+	if (bp && xfs_buf_try_hold(bp)) {
 		/* found an existing buffer */
 		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
@@ -1037,18 +1042,26 @@ xfs_buf_hold(
 	struct xfs_buf	*bp)
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
-	atomic_inc(&bp->b_hold);
+
+	spin_lock(&bp->b_lock);
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
 }
 
 static void
 xfs_buf_rele_uncached(
 	struct xfs_buf	*bp)
 {
 	ASSERT(list_empty(&bp->b_lru));
-	if (atomic_dec_and_test(&bp->b_hold)) {
-		xfs_buf_ioacct_dec(bp);
-		xfs_buf_free(bp);
+
+	spin_lock(&bp->b_lock);
+	if (--bp->b_hold) {
+		spin_unlock(&bp->b_lock);
+		return;
 	}
+	__xfs_buf_ioacct_dec(bp);
+	spin_unlock(&bp->b_lock);
+	xfs_buf_free(bp);
 }
 
 static void
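A side effect visible in the first hunk: the locked xfs_buf_ioacct_dec() wrapper is gone because its only caller in this diff, xfs_buf_rele_uncached(), now takes b_lock itself and calls __xfs_buf_ioacct_dec() directly. The open-coded "if (--bp->b_hold)" is the plain-integer analogue of atomic_dec_and_test(); a generic sketch of that idiom (a hypothetical helper, not part of the patch):

#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Hypothetical helper, not part of this patch: drop one reference on a
 * counter that is only ever modified under @lock.  Returns true when
 * the caller dropped the last reference and therefore owns teardown,
 * i.e. the lock-protected equivalent of atomic_dec_and_test().
 */
static bool put_ref_locked(spinlock_t *lock, unsigned int *ref)
{
	bool last;

	spin_lock(lock);
	last = (--*ref == 0);	/* decrement and test while serialized */
	spin_unlock(lock);
	return last;
}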
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
 	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
 	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	ASSERT(atomic_read(&bp->b_hold) > 0);
-
-	/*
-	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
-	 * calls. The pag_buf_lock being taken on the last reference only
-	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-	 * to last reference we drop here is not serialised against the last
-	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
-	 * first, the last "release" reference can win the race to the lock and
-	 * free the buffer before the second-to-last reference is processed,
-	 * leading to a use-after-free scenario.
-	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-	if (!release) {
+	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold > 1) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
 		 * and it holds the only reference. This is racy because we
 		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
 		 * ensures the decrement occurs only once per-buf.
 		 */
-		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
 		goto out_unlock;
 	}
 
-	/* the last reference has been dropped ... */
+	/* we are asked to drop the last reference */
+	spin_lock(&bch->bc_lock);
 	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
-		 * If the buffer is added to the LRU take a new reference to the
+		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag
+		 * state flag, else drop the reference.
 		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-			atomic_inc(&bp->b_hold);
-		}
+		else
+			bp->b_hold--;
 		spin_unlock(&bch->bc_lock);
 	} else {
+		bp->b_hold--;
 		/*
 		 * most of the time buffers will already be removed from the
 		 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (atomic_read(&bp->b_hold) > 1) {
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	if (bp->b_hold > 1) {
 		/* need to wait, so skip it this pass */
+		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
-	if (!spin_trylock(&bp->b_lock))
-		return LRU_SKIP;
 
 	/*
 	 * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
 	 */
 	bp->b_flags |= _XBF_DELWRI_Q;
 	if (list_empty(&bp->b_list)) {
-		atomic_inc(&bp->b_hold);
+		xfs_buf_hold(bp);
 		list_add_tail(&bp->b_list, list);
 	}
 
fs/xfs/xfs_buf.h

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
-	atomic_t		b_hold;		/* reference count */
+	unsigned int		b_hold;		/* reference count */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
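With every writer now serialized on b_lock, b_hold no longer needs to be an atomic_t. The tracepoints below keep reading it without the lock; that is an intentionally racy snapshot used only for debugging. A minimal sketch of the resulting access discipline (hypothetical names, assuming only what the diff shows):

#include <linux/spinlock.h>

/* Hypothetical illustration of the post-patch access rules. */
struct counted {
	spinlock_t	lock;
	unsigned int	hold;	/* modified only under ->lock */
};

static void counted_init(struct counted *c)
{
	spin_lock_init(&c->lock);
	c->hold = 1;		/* born with one reference, like _xfs_buf_alloc() */
}

static void counted_get(struct counted *c)
{
	spin_lock(&c->lock);
	c->hold++;		/* plain increment, no atomics needed */
	spin_unlock(&c->lock);
}

static unsigned int counted_peek(struct counted *c)
{
	/* lockless diagnostic read, may be stale (like the tracepoints) */
	return c->hold;
}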

fs/xfs/xfs_trace.h

Lines changed: 5 additions & 5 deletions
@@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
 		__entry->flags = flags;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->caller_ip = caller_ip;
@@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->error = error;
@@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
 		__entry->buf_len = bip->bli_buf->b_length;
 		__entry->buf_flags = bip->bli_buf->b_flags;
-		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_hold = bip->bli_buf->b_hold;
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
 		__entry->li_flags = bip->bli_item.li_flags;
@@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
