Commit 7a51608

Author: Kent Overstreet (committed)
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree and check references to another: extents <-> backpointers.

Checking references generates random lookups, so we want to pin that btree in memory (or only a range, if it doesn't fit in RAM).

Previously, this was done with a simple check in the shrinker: "if this btree node is in the range being pinned, don't free it". But this generated OOMs, as our shrinker wasn't well behaved when less memory was available than expected.

Instead, we now have two different shrinkers and LRU lists; the second shrinker is for pinned nodes, with seeks set much higher than normal, so they can still be freed if necessary, but we'll prefer not to.

Signed-off-by: Kent Overstreet <[email protected]>
1 parent 91ddd71 commit 7a51608
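The new machinery revolves around a small per-list structure. Below is a minimal sketch of what the hunks in this commit imply, reconstructed from the accessors they use (list->idx, list->nr, list->list, list->shrink, and the two-element bc->live[] array); the authoritative definition lives in the bcachefs headers and is not part of this page:

/* Sketch reconstructed from this commit's hunks; not the verbatim definition. */
struct btree_cache_list {
        unsigned                idx;     /* 0 = normal LRU, 1 = pinned nodes */
        struct shrinker         *shrink; /* per-list shrinker; seeks differ */
        struct list_head        list;    /* LRU list of btree nodes */
        size_t                  nr;      /* count of nodes on this list */
};

Given one of these lists, the owning cache and filesystem are recovered with container_of(), e.g. container_of(list, struct btree_cache, live[list->idx]), which is how the scan callback in this commit gets from shrinker->private_data back to struct bch_fs.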

File tree

7 files changed (+150, -75 lines)


fs/bcachefs/backpointers.c

Lines changed: 7 additions & 6 deletions
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
         s64 mem_may_pin = mem_may_pin_bytes(c);
         int ret = 0;

+        bch2_btree_cache_unpin(c);
+
         btree_interior_mask |= btree_leaf_mask;

-        c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
-        c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+        c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+        c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
         c->btree_cache.pinned_nodes_start = start;
         c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;

@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
                                         BBPOS(btree, b->key.k.p);
                                 break;
                         }
+                        bch2_node_pin(c, b);
                         0;
                 }));
         }
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
         bch2_trans_put(trans);
         bch2_bkey_buf_exit(&s.last_flushed, c);

-        c->btree_cache.pinned_nodes_leaf_mask = 0;
-        c->btree_cache.pinned_nodes_interior_mask = 0;
+        bch2_btree_cache_unpin(c);

         bch_err_fn(c, ret);
         return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
         }
         bch2_trans_put(trans);

-        c->btree_cache.pinned_nodes_leaf_mask = 0;
-        c->btree_cache.pinned_nodes_interior_mask = 0;
+        bch2_btree_cache_unpin(c);

         bch_err_fn(c, ret);
         return ret;
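Taken together, the hunks above replace direct mask fiddling with a pin/unpin API. A condensed sketch of the lifecycle a fsck pass now follows (assembled from the hunks above; the btree walk, the mem_may_pin budget check, and error handling are elided):

/* Condensed from bch2_get_btree_in_memory_pos() and the check_* functions
 * above; illustrative only. */

/* 1. Drop stale pins, then declare which btrees and key range to pin: */
bch2_btree_cache_unpin(c);
c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;     /* leaf nodes */
c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; /* interior nodes */
c->btree_cache.pinned_nodes_start   = start;

/* 2. While walking the btree, move each in-range node onto the pinned
 *    LRU, stopping once the mem_may_pin budget is exhausted: */
bch2_node_pin(c, b);

/* 3. When the pass finishes (or fails), zero the masks and return every
 *    pinned node to the normal LRU: */
bch2_btree_cache_unpin(c);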

fs/bcachefs/btree_cache.c

Lines changed: 112 additions & 52 deletions
@@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
         c->btree_cache.nr_reserve = reserve;
 }

-static inline size_t btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
 {
-        return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve);
+        struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+        size_t can_free = list->nr;
+        if (!list->idx)
+                can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+        return can_free;
 }

 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
         six_unlock_intent(&b->c.lock);
 }

+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+        struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+        u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+        return ((mask & BIT_ULL(b->c.btree_id)) &&
+                bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+                bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+        struct btree_cache *bc = &c->btree_cache;
+
+        mutex_lock(&bc->lock);
+        BUG_ON(!__btree_node_pinned(bc, b));
+        if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+                set_btree_node_pinned(b);
+                list_move(&b->list, &bc->live[1].list);
+                bc->live[0].nr--;
+                bc->live[1].nr++;
+        }
+        mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+        struct btree_cache *bc = &c->btree_cache;
+        struct btree *b, *n;
+
+        mutex_lock(&bc->lock);
+        c->btree_cache.pinned_nodes_mask[0] = 0;
+        c->btree_cache.pinned_nodes_mask[1] = 0;
+
+        list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+                clear_btree_node_pinned(b);
+                list_move(&b->list, &bc->live[0].list);
+                bc->live[0].nr++;
+                bc->live[1].nr--;
+        }
+
+        mutex_unlock(&bc->lock);
+}
+
 /* Btree in memory cache - hash table */

 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
         if (b->c.btree_id < BTREE_ID_NR)
                 --bc->nr_by_btree[b->c.btree_id];

-        bc->nr_live--;
+        bc->live[btree_node_pinned(b)].nr--;
         bc->nr_freeable++;
         list_move(&b->list, &bc->freeable);
 }
@@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)

         if (b->c.btree_id < BTREE_ID_NR)
                 bc->nr_by_btree[b->c.btree_id]++;
-        bc->nr_live++;
+
+        bool p = __btree_node_pinned(bc, b);
+        mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+        list_move_tail(&b->list, &bc->live[p].list);
+        bc->live[p].nr++;
+
         bc->nr_freeable--;
-        list_move_tail(&b->list, &bc->live);
         return 0;
 }

@@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
         int ret = 0;

         lockdep_assert_held(&bc->lock);
-
-        struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
-        u64 mask = b->c.level
-                ? bc->pinned_nodes_interior_mask
-                : bc->pinned_nodes_leaf_mask;
-
-        if ((mask & BIT_ULL(b->c.btree_id)) &&
-            bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-            bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) {
-                BTREE_CACHE_NOT_FREED_INCREMENT(pinned);
-                return -BCH_ERR_ENOMEM_btree_node_reclaim;
-        }
-
 wait_on_io:
         if (b->flags & ((1U << BTREE_NODE_dirty)|
                         (1U << BTREE_NODE_read_in_flight)|
@@ -401,17 +442,17 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                            struct shrink_control *sc)
 {
-        struct bch_fs *c = shrink->private_data;
-        struct btree_cache *bc = &c->btree_cache;
+        struct btree_cache_list *list = shrink->private_data;
+        struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+        struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
         struct btree *b, *t;
         unsigned long nr = sc->nr_to_scan;
         unsigned long can_free = 0;
         unsigned long freed = 0;
         unsigned long touched = 0;
         unsigned i, flags;
         unsigned long ret = SHRINK_STOP;
-        bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >=
-                (bc->nr_live + bc->nr_freeable) * 3 / 4;
+        bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

         if (bch2_btree_shrinker_disabled)
                 return SHRINK_STOP;
@@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
          * succeed, so that inserting keys into the btree can always succeed and
          * IO can always make forward progress:
          */
-        can_free = btree_cache_can_free(bc);
+        can_free = btree_cache_can_free(list);
         nr = min_t(unsigned long, nr, can_free);

         i = 0;
@@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                 }
         }
 restart:
-        list_for_each_entry_safe(b, t, &bc->live, list) {
+        list_for_each_entry_safe(b, t, &list->list, list) {
                 touched++;

                 if (btree_node_accessed(b)) {
@@ -476,7 +517,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                            !btree_node_will_make_reachable(b) &&
                            !btree_node_write_blocked(b) &&
                            six_trylock_read(&b->c.lock)) {
-                        list_move(&bc->live, &b->list);
+                        list_move(&list->list, &b->list);
                         mutex_unlock(&bc->lock);
                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                         six_unlock_read(&b->c.lock);
@@ -490,8 +531,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                         break;
         }
 out_rotate:
-        if (&t->list != &bc->live)
-                list_move_tail(&bc->live, &t->list);
+        if (&t->list != &list->list)
+                list_move_tail(&list->list, &t->list);
 out:
         mutex_unlock(&bc->lock);
 out_nounlock:
@@ -504,40 +545,42 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                             struct shrink_control *sc)
 {
-        struct bch_fs *c = shrink->private_data;
-        struct btree_cache *bc = &c->btree_cache;
+        struct btree_cache_list *list = shrink->private_data;

         if (bch2_btree_shrinker_disabled)
                 return 0;

-        return btree_cache_can_free(bc);
+        return btree_cache_can_free(list);
 }

 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
         struct btree_cache *bc = &c->btree_cache;
         struct btree *b, *t;
-        unsigned i, flags;
+        unsigned long flags;

-        shrinker_free(bc->shrink);
+        shrinker_free(bc->live[1].shrink);
+        shrinker_free(bc->live[0].shrink);

         /* vfree() can allocate memory: */
         flags = memalloc_nofs_save();
         mutex_lock(&bc->lock);

         if (c->verify_data)
-                list_move(&c->verify_data->list, &bc->live);
+                list_move(&c->verify_data->list, &bc->live[0].list);

         kvfree(c->verify_ondisk);

-        for (i = 0; i < btree_id_nr_alive(c); i++) {
+        for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
                 struct btree_root *r = bch2_btree_id_root(c, i);

                 if (r->b)
-                        list_add(&r->b->list, &bc->live);
+                        list_add(&r->b->list, &bc->live[0].list);
         }

-        list_for_each_entry_safe(b, t, &bc->live, list)
+        list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+                bch2_btree_node_hash_remove(bc, b);
+        list_for_each_entry_safe(b, t, &bc->live[0].list, list)
                 bch2_btree_node_hash_remove(bc, b);

         list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)

         for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
                 BUG_ON(bc->nr_by_btree[i]);
-        BUG_ON(bc->nr_live);
+        BUG_ON(bc->live[0].nr);
+        BUG_ON(bc->live[1].nr);
         BUG_ON(bc->nr_freeable);

         if (bc->table_init_done)
@@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
         if (!__bch2_btree_node_mem_alloc(c))
                 goto err;

-        list_splice_init(&bc->live, &bc->freeable);
+        list_splice_init(&bc->live[0].list, &bc->freeable);

         mutex_init(&c->verify_lock);

         shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
         if (!shrink)
                 goto err;
-        bc->shrink = shrink;
+        bc->live[0].shrink = shrink;
+        shrink->count_objects = bch2_btree_cache_count;
+        shrink->scan_objects = bch2_btree_cache_scan;
+        shrink->seeks = 2;
+        shrink->private_data = &bc->live[0];
+        shrinker_register(shrink);
+
+        shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+        if (!shrink)
+                goto err;
+        bc->live[1].shrink = shrink;
         shrink->count_objects = bch2_btree_cache_count;
         shrink->scan_objects = bch2_btree_cache_scan;
-        shrink->seeks = 4;
-        shrink->private_data = c;
+        shrink->seeks = 8;
+        shrink->private_data = &bc->live[1];
         shrinker_register(shrink);

         return 0;
@@ -611,7 +665,10 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 {
         mutex_init(&bc->lock);
-        INIT_LIST_HEAD(&bc->live);
+        for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+                bc->live[i].idx = i;
+                INIT_LIST_HEAD(&bc->live[i].list);
+        }
         INIT_LIST_HEAD(&bc->freeable);
         INIT_LIST_HEAD(&bc->freed_pcpu);
         INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -673,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
         struct btree_cache *bc = &c->btree_cache;
         struct btree *b;

-        list_for_each_entry_reverse(b, &bc->live, list)
-                if (!btree_node_reclaim(c, b, false))
-                        return b;
+        for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+                list_for_each_entry_reverse(b, &bc->live[i].list, list)
+                        if (!btree_node_reclaim(c, b, false))
+                                return b;

         while (1) {
-                list_for_each_entry_reverse(b, &bc->live, list)
-                        if (!btree_node_write_and_reclaim(c, b))
-                                return b;
+                for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+                        list_for_each_entry_reverse(b, &bc->live[i].list, list)
+                                if (!btree_node_write_and_reclaim(c, b))
+                                        return b;

                 /*
                  * Rare case: all nodes were intent-locked.
@@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
         if (!out->nr_tabstops)
                 printbuf_tabstop_push(out, 32);

-        prt_btree_cache_line(out, c, "nr_live:", bc->nr_live);
-        prt_btree_cache_line(out, c, "nr_freeable:", bc->nr_freeable);
-        prt_btree_cache_line(out, c, "nr dirty:", atomic_long_read(&bc->nr_dirty));
+        prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+        prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+        prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+        prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
         prt_newline(out);

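A note on the tuning in the shrinker-registration hunk above: the normal list registers with seeks = 2 and the pinned list with seeks = 8. In the kernel's shrinker heuristic, scan pressure is roughly inversely proportional to seeks, so for the same count of freeable objects the pinned shrinker is pushed about four times less aggressively. Combined with dropping the hard -BCH_ERR_ENOMEM_btree_node_reclaim refusal from __btree_node_reclaim(), this makes pinning advisory rather than absolute: under real memory pressure pinned nodes can still be reclaimed, which is what fixes the OOMs described in the commit message.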

fs/bcachefs/btree_cache.h

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
 int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
                                 unsigned, enum btree_id);

+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
 void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
                                       struct bkey_s_c, struct bkey_i *);
