Skip to content

Commit 6f2a71a

Browse files
committed
Merge tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs
Pull bcachefs fixes from Kent Overstreet:

 - Lots of small check/repair fixes, primarily in subvol loop and
   directory structure loop (when involving snapshots).

 - Fix a few 6.16 regressions: rare UAF in the foreground allocator path
   when taking a transaction restart from the transaction bump
   allocator, and some small fallout from the change to log the error
   being corrected in the journal when repairing errors, also some
   fallout from the btree node read error logging improvements. (Alan,
   Bharadwaj)

 - New option: journal_rewind

   This lets the entire filesystem be reset to an earlier point in time.

   Note that this is only a disaster recovery tool, and right now there
   are major caveats to using it (discards should be disabled, in
   particular), but it successfully restored the filesystem of one of
   the users who was bit by the subvolume deletion bug and didn't have
   backups. I'll likely be making some changes to the discard path in
   the future to make this a reliable recovery tool.

 - Some new btree iterator tracepoints, for tracking down some
   livelock-ish behaviour we've been seeing in the main data write path.
* tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs: (51 commits)
  bcachefs: Plumb correct ip to trans_relock_fail tracepoint
  bcachefs: Ensure we rewind to run recovery passes
  bcachefs: Ensure btree node scan runs before checking for scanned nodes
  bcachefs: btree_root_unreadable_and_scan_found_nothing should not be autofix
  bcachefs: fix bch2_journal_keys_peek_prev_min() underflow
  bcachefs: Use wait_on_allocator() when allocating journal
  bcachefs: Check for bad write buffer key when moving from journal
  bcachefs: Don't unlock the trans if ret doesn't match BCH_ERR_operation_blocked
  bcachefs: Fix range in bch2_lookup_indirect_extent() error path
  bcachefs: fix spurious error_throw
  bcachefs: Add missing bch2_err_class() to fileattr_set()
  bcachefs: Add missing key type checks to check_snapshot_exists()
  bcachefs: Don't log fsck err in the journal if doing repair elsewhere
  bcachefs: Fix *__bch2_trans_subbuf_alloc() error path
  bcachefs: Fix missing newlines before ero
  bcachefs: fix spurious error in read_btree_roots()
  bcachefs: fsck: Fix oops in key_visible_in_snapshot()
  bcachefs: fsck: fix unhandled restart in topology repair
  bcachefs: fsck: Fix check_directory_structure when no check_dirents
  bcachefs: Fix restart handling in btree_node_scrub_work()
  ...
2 parents 8a20830 + ef6fac0 commit 6f2a71a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+734
-451
lines changed

fs/bcachefs/alloc_background.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
14061406
: BCH_DATA_free;
14071407
struct printbuf buf = PRINTBUF;
14081408

1409+
unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
1410+
FSCK_CAN_FIX|FSCK_CAN_IGNORE;
1411+
14091412
struct bpos bucket = iter->pos;
14101413
bucket.offset &= ~(~0ULL << 56);
14111414
u64 genbits = iter->pos.offset & (~0ULL << 56);
@@ -1419,9 +1422,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
14191422
return ret;
14201423

14211424
if (!bch2_dev_bucket_exists(c, bucket)) {
1422-
if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
1423-
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
1424-
bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
1425+
if (__fsck_err(trans, fsck_flags,
1426+
need_discard_freespace_key_to_invalid_dev_bucket,
1427+
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
1428+
bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
14251429
goto delete;
14261430
ret = 1;
14271431
goto out;
@@ -1433,7 +1437,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
14331437
if (a->data_type != state ||
14341438
(state == BCH_DATA_free &&
14351439
genbits != alloc_freespace_genbits(*a))) {
1436-
if (fsck_err(trans, need_discard_freespace_key_bad,
1440+
if (__fsck_err(trans, fsck_flags,
1441+
need_discard_freespace_key_bad,
14371442
"%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
14381443
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
14391444
bch2_btree_id_str(iter->btree_id),

fs/bcachefs/backpointers.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
353353
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
354354
} else {
355355
struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
356-
if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)))
356+
if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
357357
return bkey_s_c_null;
358358
if (IS_ERR_OR_NULL(b))
359359
return ((struct bkey_s_c) { .k = ERR_CAST(b) });

fs/bcachefs/bcachefs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,8 @@ struct btree_trans_buf {
767767
x(sysfs) \
768768
x(btree_write_buffer) \
769769
x(btree_node_scrub) \
770-
x(async_recovery_passes)
770+
x(async_recovery_passes) \
771+
x(ioctl_data)
771772

772773
enum bch_write_ref {
773774
#define x(n) BCH_WRITE_REF_##n,

fs/bcachefs/btree_gc.c

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -503,8 +503,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
503503
prt_newline(&buf);
504504
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
505505

506+
/*
507+
* XXX: we're not passing the trans object here because we're not set up
508+
* to handle a transaction restart - this code needs to be rewritten
509+
* when we start doing online topology repair
510+
*/
511+
bch2_trans_unlock_long(trans);
506512
if (mustfix_fsck_err_on(!have_child,
507-
trans, btree_node_topology_interior_node_empty,
513+
c, btree_node_topology_interior_node_empty,
508514
"empty interior btree node at %s", buf.buf))
509515
ret = DROP_THIS_NODE;
510516
err:
@@ -528,32 +534,39 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
528534
return ret;
529535
}
530536

531-
static int bch2_check_root(struct btree_trans *trans, enum btree_id i,
537+
static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
532538
bool *reconstructed_root)
533539
{
534540
struct bch_fs *c = trans->c;
535-
struct btree_root *r = bch2_btree_id_root(c, i);
541+
struct btree_root *r = bch2_btree_id_root(c, btree);
536542
struct printbuf buf = PRINTBUF;
537543
int ret = 0;
538544

539-
bch2_btree_id_to_text(&buf, i);
545+
bch2_btree_id_to_text(&buf, btree);
540546

541547
if (r->error) {
542548
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
543549

544-
r->alive = false;
545-
r->error = 0;
550+
ret = bch2_btree_has_scanned_nodes(c, btree);
551+
if (ret < 0)
552+
goto err;
546553

547-
if (!bch2_btree_has_scanned_nodes(c, i)) {
554+
if (!ret) {
548555
__fsck_err(trans,
549-
FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0),
556+
FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
550557
btree_root_unreadable_and_scan_found_nothing,
551558
"no nodes found for btree %s, continue?", buf.buf);
552-
bch2_btree_root_alloc_fake_trans(trans, i, 0);
559+
560+
r->alive = false;
561+
r->error = 0;
562+
bch2_btree_root_alloc_fake_trans(trans, btree, 0);
553563
} else {
554-
bch2_btree_root_alloc_fake_trans(trans, i, 1);
555-
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
556-
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
564+
r->alive = false;
565+
r->error = 0;
566+
bch2_btree_root_alloc_fake_trans(trans, btree, 1);
567+
568+
bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
569+
ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
557570
if (ret)
558571
goto err;
559572
}

fs/bcachefs/btree_io.c

Lines changed: 31 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,9 @@ static int __btree_err(int ret,
557557
const char *fmt, ...)
558558
{
559559
if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
560-
return bch_err_throw(c, fsck_fix);
560+
return ret == -BCH_ERR_btree_node_read_err_fixable
561+
? bch_err_throw(c, fsck_fix)
562+
: ret;
561563

562564
bool have_retry = false;
563565
int ret2;
@@ -723,12 +725,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
723725

724726
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
725727
struct btree *b, struct bset *i,
726-
unsigned offset, unsigned sectors, int write,
728+
unsigned offset, int write,
727729
struct bch_io_failures *failed,
728730
struct printbuf *err_msg)
729731
{
730732
unsigned version = le16_to_cpu(i->version);
731-
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
732733
struct printbuf buf1 = PRINTBUF;
733734
struct printbuf buf2 = PRINTBUF;
734735
int ret = 0;
@@ -778,15 +779,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
778779
btree_node_unsupported_version,
779780
"BSET_SEPARATE_WHITEOUTS no longer supported");
780781

781-
if (!write &&
782-
btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
783-
-BCH_ERR_btree_node_read_err_fixable,
784-
c, ca, b, i, NULL,
785-
bset_past_end_of_btree_node,
786-
"bset past end of btree node (offset %u len %u but written %zu)",
787-
offset, sectors, ptr_written ?: btree_sectors(c)))
788-
i->u64s = 0;
789-
790782
btree_err_on(offset && !i->u64s,
791783
-BCH_ERR_btree_node_read_err_fixable,
792784
c, ca, b, i, NULL,
@@ -1151,6 +1143,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
11511143
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
11521144

11531145
if (first) {
1146+
sectors = vstruct_sectors(b->data, c->block_bits);
1147+
if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
1148+
-BCH_ERR_btree_node_read_err_fixable,
1149+
c, ca, b, i, NULL,
1150+
bset_past_end_of_btree_node,
1151+
"bset past end of btree node (offset %u len %u but written %zu)",
1152+
b->written, sectors, ptr_written ?: btree_sectors(c)))
1153+
i->u64s = 0;
11541154
if (good_csum_type) {
11551155
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
11561156
bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
@@ -1178,9 +1178,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
11781178
c, NULL, b, NULL, NULL,
11791179
btree_node_unsupported_version,
11801180
"btree node does not have NEW_EXTENT_OVERWRITE set");
1181-
1182-
sectors = vstruct_sectors(b->data, c->block_bits);
11831181
} else {
1182+
sectors = vstruct_sectors(bne, c->block_bits);
1183+
if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
1184+
-BCH_ERR_btree_node_read_err_fixable,
1185+
c, ca, b, i, NULL,
1186+
bset_past_end_of_btree_node,
1187+
"bset past end of btree node (offset %u len %u but written %zu)",
1188+
b->written, sectors, ptr_written ?: btree_sectors(c)))
1189+
i->u64s = 0;
11841190
if (good_csum_type) {
11851191
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
11861192
bool csum_bad = bch2_crc_cmp(bne->csum, csum);
@@ -1201,14 +1207,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
12011207
"decrypting btree node: %s", bch2_err_str(ret)))
12021208
goto fsck_err;
12031209
}
1204-
1205-
sectors = vstruct_sectors(bne, c->block_bits);
12061210
}
12071211

12081212
b->version_ondisk = min(b->version_ondisk,
12091213
le16_to_cpu(i->version));
12101214

1211-
ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
1215+
ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
12121216
if (ret)
12131217
goto fsck_err;
12141218

@@ -1982,28 +1986,12 @@ static void btree_node_scrub_work(struct work_struct *work)
19821986
prt_newline(&err);
19831987

19841988
if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
1985-
struct btree_trans *trans = bch2_trans_get(c);
1986-
1987-
struct btree_iter iter;
1988-
bch2_trans_node_iter_init(trans, &iter, scrub->btree,
1989-
scrub->key.k->k.p, 0, scrub->level - 1, 0);
1990-
1991-
struct btree *b;
1992-
int ret = lockrestart_do(trans,
1993-
PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
1994-
if (ret)
1995-
goto err;
1996-
1997-
if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
1998-
bch_err(c, "error validating btree node during scrub on %s at btree %s",
1999-
scrub->ca->name, err.buf);
2000-
2001-
ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
2002-
}
2003-
err:
2004-
bch2_trans_iter_exit(trans, &iter);
2005-
bch2_trans_begin(trans);
2006-
bch2_trans_put(trans);
1989+
int ret = bch2_trans_do(c,
1990+
bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
1991+
scrub->key.k, 0));
1992+
if (!bch2_err_matches(ret, ENOENT) &&
1993+
!bch2_err_matches(ret, EROFS))
1994+
bch_err_fn_ratelimited(c, ret);
20071995
}
20081996

20091997
printbuf_exit(&err);
@@ -2267,7 +2255,7 @@ static void btree_node_write_endio(struct bio *bio)
22672255
}
22682256

22692257
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
2270-
struct bset *i, unsigned sectors)
2258+
struct bset *i)
22712259
{
22722260
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
22732261
(struct bkey_validate_context) {
@@ -2282,7 +2270,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
22822270
}
22832271

22842272
ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
2285-
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
2273+
validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
22862274
if (ret) {
22872275
bch2_inconsistent_error(c);
22882276
dump_stack();
@@ -2475,7 +2463,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
24752463

24762464
/* if we're going to be encrypting, check metadata validity first: */
24772465
if (validate_before_checksum &&
2478-
validate_bset_for_write(c, b, i, sectors_to_write))
2466+
validate_bset_for_write(c, b, i))
24792467
goto err;
24802468

24812469
ret = bset_encrypt(c, i, b->written << 9);
@@ -2492,7 +2480,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
24922480

24932481
/* if we're not encrypting, check metadata after checksumming: */
24942482
if (!validate_before_checksum &&
2495-
validate_bset_for_write(c, b, i, sectors_to_write))
2483+
validate_bset_for_write(c, b, i))
24962484
goto err;
24972485

24982486
/*

0 commit comments

Comments
 (0)