Skip to content

Commit ec25bd8

Browse files
committed
Merge tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs repair code from Kent Overstreet: "A couple more small fixes, and new repair code. We can now automatically recover from arbitrary corrupted interior btree nodes by scanning, and we can reconstruct metadata as needed to bring a filesystem back into a working, consistent, read-write state and preserve access to whatever wasn't corrupted. Meaning - you can blow away all metadata except for extents and dirents leaf nodes, and repair will reconstruct everything else and give you your data, and under the correct paths. If inodes are missing i_size will be slightly off and permissions/ownership/timestamps will be gone, and we do still need the snapshots btree if snapshots were in use - in the future we'll be able to guess the snapshot tree structure in some situations. IOW - aside from shaking out remaining bugs (fuzz testing is still coming), repair code should be complete and if repair ever doesn't work that's the highest priority bug that I want to know about immediately. This patchset was kindly tested by a user from India who accidentally wiped one drive out of a three drive filesystem with no replication on the family computer - it took a couple weeks but we got everything important back" * tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs: bcachefs: reconstruct_inode() bcachefs: Subvolume reconstruction bcachefs: Check for extents that point to same space bcachefs: Reconstruct missing snapshot nodes bcachefs: Flag btrees with missing data bcachefs: Topology repair now uses nodes found by scanning to fill holes bcachefs: Repair pass for scanning for btree nodes bcachefs: Don't skip fake btree roots in fsck bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake() bcachefs: Etyzinger cleanups bcachefs: bch2_shoot_down_journal_keys() bcachefs: Clear recovery_passes_required as they complete without errors bcachefs: ratelimit informational fsck errors bcachefs: Check for bad needs_discard before doing discard bcachefs: Improve 
bch2_btree_update_to_text() mean_and_variance: Drop always failing tests bcachefs: fix nocow lock deadlock bcachefs: BCH_WATERMARK_interior_updates bcachefs: Fix btree node reserve
2 parents c85af71 + 09d4c2a commit ec25bd8

39 files changed

+1869
-494
lines changed

fs/bcachefs/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ bcachefs-y := \
1717
btree_journal_iter.o \
1818
btree_key_cache.o \
1919
btree_locking.o \
20+
btree_node_scan.o \
2021
btree_trans_commit.o \
2122
btree_update.o \
2223
btree_update_interior.o \
@@ -37,6 +38,7 @@ bcachefs-y := \
3738
error.o \
3839
extents.o \
3940
extent_update.o \
41+
eytzinger.o \
4042
fs.o \
4143
fs-common.o \
4244
fs-ioctl.o \

fs/bcachefs/alloc_background.c

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
17131713
if (ret)
17141714
goto out;
17151715

1716-
if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1717-
a->v.gen++;
1718-
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1719-
goto write;
1720-
}
1721-
1722-
if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1723-
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1724-
bch2_trans_inconsistent(trans,
1725-
"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
1726-
"%s",
1727-
a->v.journal_seq,
1728-
c->journal.flushed_seq_ondisk,
1729-
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1716+
if (a->v.dirty_sectors) {
1717+
if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1718+
trans, "attempting to discard bucket with dirty data\n%s",
1719+
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
17301720
ret = -EIO;
1731-
}
17321721
goto out;
17331722
}
17341723

17351724
if (a->v.data_type != BCH_DATA_need_discard) {
1736-
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1737-
bch2_trans_inconsistent(trans,
1738-
"bucket incorrectly set in need_discard btree\n"
1739-
"%s",
1740-
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1741-
ret = -EIO;
1725+
if (data_type_is_empty(a->v.data_type) &&
1726+
BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1727+
a->v.gen++;
1728+
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1729+
goto write;
17421730
}
17431731

1732+
if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1733+
trans, "bucket incorrectly set in need_discard btree\n"
1734+
"%s",
1735+
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1736+
ret = -EIO;
1737+
goto out;
1738+
}
1739+
1740+
if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1741+
if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1742+
trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1743+
a->v.journal_seq,
1744+
c->journal.flushed_seq_ondisk,
1745+
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1746+
ret = -EIO;
17441747
goto out;
17451748
}
17461749

@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
18351838
if (ret)
18361839
goto err;
18371840

1841+
BUG_ON(a->v.dirty_sectors);
18381842
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
18391843
a->v.data_type = alloc_data_type(a->v, a->v.data_type);
18401844

@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
19421946
goto out;
19431947

19441948
BUG_ON(a->v.data_type != BCH_DATA_cached);
1949+
BUG_ON(a->v.dirty_sectors);
19451950

19461951
if (!a->v.cached_sectors)
19471952
bch_err(c, "invalidating empty bucket, confused");

fs/bcachefs/alloc_foreground.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
188188
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189189
{
190190
switch (watermark) {
191-
case BCH_WATERMARK_reclaim:
191+
case BCH_WATERMARK_interior_updates:
192192
return 0;
193+
case BCH_WATERMARK_reclaim:
194+
return OPEN_BUCKETS_COUNT / 6;
193195
case BCH_WATERMARK_btree:
194196
case BCH_WATERMARK_btree_copygc:
195197
return OPEN_BUCKETS_COUNT / 4;

fs/bcachefs/alloc_types.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
2222
x(copygc) \
2323
x(btree) \
2424
x(btree_copygc) \
25-
x(reclaim)
25+
x(reclaim) \
26+
x(interior_updates)
2627

2728
enum bch_watermark {
2829
#define x(name) BCH_WATERMARK_##name,

fs/bcachefs/backpointers.c

Lines changed: 166 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "btree_update.h"
99
#include "btree_update_interior.h"
1010
#include "btree_write_buffer.h"
11+
#include "checksum.h"
1112
#include "error.h"
1213

1314
#include <linux/mm.h>
@@ -418,28 +419,113 @@ struct extents_to_bp_state {
418419
struct bkey_buf last_flushed;
419420
};
420421

422+
static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
423+
struct bkey_s_c extent, unsigned dev)
424+
{
425+
struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
426+
int ret = PTR_ERR_OR_ZERO(n);
427+
if (ret)
428+
return ret;
429+
430+
bch2_bkey_drop_device(bkey_i_to_s(n), dev);
431+
return bch2_btree_insert_trans(trans, btree, n, 0);
432+
}
433+
434+
static int check_extent_checksum(struct btree_trans *trans,
435+
enum btree_id btree, struct bkey_s_c extent,
436+
enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
437+
{
438+
struct bch_fs *c = trans->c;
439+
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
440+
const union bch_extent_entry *entry;
441+
struct extent_ptr_decoded p;
442+
struct printbuf buf = PRINTBUF;
443+
void *data_buf = NULL;
444+
struct bio *bio = NULL;
445+
size_t bytes;
446+
int ret = 0;
447+
448+
if (bkey_is_btree_ptr(extent.k))
449+
return false;
450+
451+
bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
452+
if (p.ptr.dev == dev)
453+
goto found;
454+
BUG();
455+
found:
456+
if (!p.crc.csum_type)
457+
return false;
458+
459+
bytes = p.crc.compressed_size << 9;
460+
461+
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
462+
if (!bch2_dev_get_ioref(ca, READ))
463+
return false;
464+
465+
data_buf = kvmalloc(bytes, GFP_KERNEL);
466+
if (!data_buf) {
467+
ret = -ENOMEM;
468+
goto err;
469+
}
470+
471+
bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
472+
bio->bi_iter.bi_sector = p.ptr.offset;
473+
bch2_bio_map(bio, data_buf, bytes);
474+
ret = submit_bio_wait(bio);
475+
if (ret)
476+
goto err;
477+
478+
prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
479+
prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
480+
bch2_bkey_val_to_text(&buf, c, extent);
481+
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
482+
bch2_bkey_val_to_text(&buf, c, extent2);
483+
484+
struct nonce nonce = extent_nonce(extent.k->version, p.crc);
485+
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
486+
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
487+
c, dup_backpointer_to_bad_csum_extent,
488+
"%s", buf.buf))
489+
ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
490+
fsck_err:
491+
err:
492+
if (bio)
493+
bio_put(bio);
494+
kvfree(data_buf);
495+
percpu_ref_put(&ca->io_ref);
496+
printbuf_exit(&buf);
497+
return ret;
498+
}
499+
421500
static int check_bp_exists(struct btree_trans *trans,
422501
struct extents_to_bp_state *s,
423502
struct bpos bucket,
424503
struct bch_backpointer bp,
425504
struct bkey_s_c orig_k)
426505
{
427506
struct bch_fs *c = trans->c;
428-
struct btree_iter bp_iter = { NULL };
507+
struct btree_iter bp_iter = {};
508+
struct btree_iter other_extent_iter = {};
429509
struct printbuf buf = PRINTBUF;
430510
struct bkey_s_c bp_k;
431511
struct bkey_buf tmp;
432512
int ret;
433513

434514
bch2_bkey_buf_init(&tmp);
435515

516+
if (!bch2_dev_bucket_exists(c, bucket)) {
517+
prt_str(&buf, "extent for nonexistent device:bucket ");
518+
bch2_bpos_to_text(&buf, bucket);
519+
prt_str(&buf, "\n ");
520+
bch2_bkey_val_to_text(&buf, c, orig_k);
521+
bch_err(c, "%s", buf.buf);
522+
return -BCH_ERR_fsck_repair_unimplemented;
523+
}
524+
436525
if (bpos_lt(bucket, s->bucket_start) ||
437526
bpos_gt(bucket, s->bucket_end))
438527
return 0;
439528

440-
if (!bch2_dev_bucket_exists(c, bucket))
441-
goto missing;
442-
443529
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
444530
bucket_pos_to_bp(c, bucket, bp.bucket_offset),
445531
0);
@@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans,
465551
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
466552
goto out;
467553
}
468-
goto missing;
554+
555+
goto check_existing_bp;
469556
}
470557
out:
471558
err:
472559
fsck_err:
560+
bch2_trans_iter_exit(trans, &other_extent_iter);
473561
bch2_trans_iter_exit(trans, &bp_iter);
474562
bch2_bkey_buf_exit(&tmp, c);
475563
printbuf_exit(&buf);
476564
return ret;
565+
check_existing_bp:
566+
/* Do we have a backpointer for a different extent? */
567+
if (bp_k.k->type != KEY_TYPE_backpointer)
568+
goto missing;
569+
570+
struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
571+
572+
struct bkey_s_c other_extent =
573+
bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
574+
ret = bkey_err(other_extent);
575+
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
576+
ret = 0;
577+
if (ret)
578+
goto err;
579+
580+
if (!other_extent.k)
581+
goto missing;
582+
583+
if (bch2_extents_match(orig_k, other_extent)) {
584+
printbuf_reset(&buf);
585+
prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
586+
bch2_bkey_val_to_text(&buf, c, orig_k);
587+
prt_str(&buf, "\n ");
588+
bch2_bkey_val_to_text(&buf, c, other_extent);
589+
bch_err(c, "%s", buf.buf);
590+
591+
if (other_extent.k->size <= orig_k.k->size) {
592+
ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
593+
if (ret)
594+
goto err;
595+
goto out;
596+
} else {
597+
ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
598+
if (ret)
599+
goto err;
600+
goto missing;
601+
}
602+
}
603+
604+
ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
605+
if (ret < 0)
606+
goto err;
607+
if (ret) {
608+
ret = 0;
609+
goto missing;
610+
}
611+
612+
ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
613+
if (ret < 0)
614+
goto err;
615+
if (ret) {
616+
ret = 0;
617+
goto out;
618+
}
619+
620+
printbuf_reset(&buf);
621+
prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
622+
bch2_bkey_val_to_text(&buf, c, orig_k);
623+
prt_str(&buf, "\n ");
624+
bch2_bkey_val_to_text(&buf, c, other_extent);
625+
bch_err(c, "%s", buf.buf);
626+
ret = -BCH_ERR_fsck_repair_unimplemented;
627+
goto err;
477628
missing:
629+
printbuf_reset(&buf);
478630
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
479631
bch2_btree_id_str(bp.btree_id), bp.level);
480632
bch2_bkey_val_to_text(&buf, c, orig_k);
481-
prt_printf(&buf, "\nbp pos ");
482-
bch2_bpos_to_text(&buf, bp_iter.pos);
633+
prt_printf(&buf, "\n got: ");
634+
bch2_bkey_val_to_text(&buf, c, bp_k);
635+
636+
struct bkey_i_backpointer n_bp_k;
637+
bkey_backpointer_init(&n_bp_k.k_i);
638+
n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
639+
n_bp_k.v = bp;
640+
prt_printf(&buf, "\n want: ");
641+
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
483642

484643
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
485644
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);

fs/bcachefs/bcachefs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,7 @@ enum bch_time_stats {
456456

457457
#include "alloc_types.h"
458458
#include "btree_types.h"
459+
#include "btree_node_scan_types.h"
459460
#include "btree_write_buffer_types.h"
460461
#include "buckets_types.h"
461462
#include "buckets_waiting_for_journal_types.h"
@@ -614,6 +615,7 @@ struct bch_dev {
614615
*/
615616

616617
#define BCH_FS_FLAGS() \
618+
x(new_fs) \
617619
x(started) \
618620
x(may_go_rw) \
619621
x(rw) \
@@ -796,6 +798,7 @@ struct bch_fs {
796798
u64 features;
797799
u64 compat;
798800
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
801+
u64 btrees_lost_data;
799802
} sb;
800803

801804

@@ -1103,6 +1106,8 @@ struct bch_fs {
11031106
struct journal_keys journal_keys;
11041107
struct list_head journal_iters;
11051108

1109+
struct find_btree_nodes found_btree_nodes;
1110+
11061111
u64 last_bucket_seq_cleanup;
11071112

11081113
u64 counters_on_mount[BCH_COUNTER_NR];

fs/bcachefs/bcachefs_format.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ struct bch_sb_field_ext {
818818
struct bch_sb_field field;
819819
__le64 recovery_passes_required[2];
820820
__le64 errors_silent[8];
821+
__le64 btrees_lost_data;
821822
};
822823

823824
struct bch_sb_field_downgrade_entry {

0 commit comments

Comments
 (0)